Choose six recent popular movies. Ask at least five people that you know (friends, family, classmates, imaginary friends) to rate each of these movie that they have seen on a scale of 1 to 5. Take the results (observations) and store them in a SQL database. Load the information into an R dataframe.
Establish the connection
#Connection to SQL table and relevant libraries
library(RMySQL)
## Warning: package 'RMySQL' was built under R version 3.4.3
library(sqldf)
## Warning: package 'gsubfn' was built under R version 3.2.5
## Warning: package 'proto' was built under R version 3.2.5
Opening database connection
#Hide actual connection due to password
#connection <- dbConnect(MySQL(), user='root', password='*******',
#dbname='movies_db', host='127.0.0.1')
#dbListTables(connection)
Fetch the data & print out samples
#establish the connection and fetch the data from the tables
imdb_info_rs <- dbSendQuery(connection , "select * from imdb_table")
imdb_info_df <- fetch(imdb_info_rs, n=-1)
imdb_info_df
| 573 |
The Shawshank Redemption |
1994 |
1071904 |
1 |
| 574 |
The Godfather |
1972 |
751381 |
2 |
| 575 |
The Godfather: Part II |
1974 |
488889 |
3 |
| 576 |
Pulp Fiction |
1994 |
830504 |
4 |
| 577 |
The Good, the Bad and the Ugly |
1966 |
322961 |
5 |
ratings_rs <- dbSendQuery(connection , "select * from rating_dim")
rating_info_df <- fetch(ratings_rs, n=-1)
rating_info_df
| 573 |
8 |
7 |
2 |
2 |
4 |
5 |
| 574 |
7 |
8 |
2 |
21 |
10 |
10 |
| 575 |
21 |
11 |
17 |
3 |
22 |
15 |
| 576 |
12 |
12 |
10 |
14 |
22 |
14 |
| 577 |
17 |
23 |
9 |
13 |
4 |
13 |
genre_rs <- dbSendQuery(connection , "select * from genre_dim")
genre_info_df <- fetch(genre_rs, n=-1)
genre_info_df
| 573 |
Crime |
| 574 |
Drama |
| 575 |
Drama |
| 576 |
Comedy |
| 577 |
Action |
| ###Merge the data on “Tit |
le_Generated_Number“ |
comprehensive_rating<- merge(x = imdb_info_df, y = rating_info_df, by = "Title_Generated_Number", all = TRUE)
comprehensive_rating<- merge(x = comprehensive_rating, y = genre_info_df, by = "Title_Generated_Number", all = TRUE)
head(comprehensive_rating,10)
## Title_Generated_Number Title Year Votes_IMDB
## 1 573 The Shawshank Redemption 1994 1071904
## 2 574 The Godfather 1972 751381
## 3 575 The Godfather: Part II 1974 488889
## 4 576 Pulp Fiction 1994 830504
## 5 577 The Good, the Bad and the Ugly 1966 322961
## 6 578 The Dark Knight 2008 1045186
## 7 579 12 Angry Men 1957 264112
## 8 580 Schindler's List 1993 545703
## 9 581 The Lord of the Rings Fellowship 2003 758388
## 10 582 Fight Club 1999 814389
## Rank_IMBD Liam_Ranking Meaghan_Ranking Mom_Ranking Heather_Ranking
## 1 1 8 7 2 2
## 2 2 7 8 2 21
## 3 3 21 11 17 3
## 4 4 12 12 10 14
## 5 5 17 23 9 13
## 6 6 1 14 24 28
## 7 7 9 20 20 26
## 8 8 2 3 7 30
## 9 9 6 4 22 14
## 10 10 18 18 28 20
## Dad_Ranking Average__Ranking genre
## 1 4 5 Crime
## 2 10 10 Drama
## 3 22 15 Drama
## 4 22 14 Comedy
## 5 4 13 Action
## 6 16 17 Action
## 7 27 20 Drama
## 8 21 13 Drama
## 9 26 14 Action
## 10 20 21 Comedy
Explore the data
#Display the summary statistics of "comprehensive_ratings""
summary(comprehensive_rating)
## Warning in summary(comprehensive_rating): bytecode version mismatch; using
## eval
## Title_Generated_Number Title Year
## Min. :573.0 Length:30 Length:30
## 1st Qu.:580.2 Class :character Class :character
## Median :587.5 Mode :character Mode :character
## Mean :587.5
## 3rd Qu.:594.8
## Max. :602.0
## Votes_IMDB Rank_IMBD Liam_Ranking Meaghan_Ranking
## Min. : 145769 Min. : 1.00 Min. : 1.00 Min. : 1.00
## 1st Qu.: 329696 1st Qu.: 8.25 1st Qu.: 8.25 1st Qu.: 8.25
## Median : 523694 Median :15.50 Median :15.50 Median :15.50
## Mean : 545994 Mean :15.50 Mean :15.50 Mean :15.50
## 3rd Qu.: 756636 3rd Qu.:22.75 3rd Qu.:22.75 3rd Qu.:22.75
## Max. :1071904 Max. :30.00 Max. :30.00 Max. :30.00
## Mom_Ranking Heather_Ranking Dad_Ranking Average__Ranking
## Min. : 1.00 Min. : 2.00 Min. : 1.00 Min. : 5.00
## 1st Qu.: 7.25 1st Qu.:10.25 1st Qu.: 5.25 1st Qu.:13.25
## Median :16.50 Median :15.50 Median :15.00 Median :15.50
## Mean :14.83 Mean :16.13 Mean :14.50 Mean :15.37
## 3rd Qu.:22.00 3rd Qu.:21.00 3rd Qu.:21.75 3rd Qu.:18.00
## Max. :28.00 Max. :30.00 Max. :30.00 Max. :23.00
## genre
## Length:30
## Class :character
## Mode :character
##
##
##
#Display the column names of "comprehensive_ratings"
names(comprehensive_rating)
## [1] "Title_Generated_Number" "Title"
## [3] "Year" "Votes_IMDB"
## [5] "Rank_IMBD" "Liam_Ranking"
## [7] "Meaghan_Ranking" "Mom_Ranking"
## [9] "Heather_Ranking" "Dad_Ranking"
## [11] "Average__Ranking" "genre"
#Display the structure of "comprehensive_rating".
str(comprehensive_rating)
## 'data.frame': 30 obs. of 12 variables:
## $ Title_Generated_Number: int 573 574 575 576 577 578 579 580 581 582 ...
## $ Title : chr "The Shawshank Redemption " "The Godfather " "The Godfather: Part II " "Pulp Fiction " ...
## $ Year : chr "1994" "1972" "1974" "1994" ...
## $ Votes_IMDB : int 1071904 751381 488889 830504 322961 1045186 264112 545703 758388 814389 ...
## $ Rank_IMBD : int 1 2 3 4 5 6 7 8 9 10 ...
## $ Liam_Ranking : int 8 7 21 12 17 1 9 2 6 18 ...
## $ Meaghan_Ranking : int 7 8 11 12 23 14 20 3 4 18 ...
## $ Mom_Ranking : int 2 2 17 10 9 24 20 7 22 28 ...
## $ Heather_Ranking : int 2 21 3 14 13 28 26 30 14 20 ...
## $ Dad_Ranking : int 4 10 22 22 4 16 27 21 26 20 ...
## $ Average__Ranking : int 5 10 15 14 13 17 20 13 14 21 ...
## $ genre : chr "Crime" "Drama" "Drama" "Comedy" ...
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.2.5
# Basic barplot. Average family rank for the top imdb movies
p<-ggplot(data=comprehensive_rating, aes(x=Title_Generated_Number, y=Average__Ranking)) +
geom_bar(stat="identity", fill="steelblue")+
geom_text(aes(label=Average__Ranking), vjust=-0.3, size=3.5)+
theme_minimal()
p

# Horizontal bar plot
p + coord_flip()

# Basic scatter plot with regression line IMDB Rank VS Average family Rank
ggplot(comprehensive_rating, aes(x=Rank_IMBD, y=Average__Ranking)) + geom_point(size=2, shape=23)+
geom_smooth(method=lm)

# Basic barplot. Count the number of films in each genre
ggplot(data=comprehensive_rating,aes(x=genre)) + geom_bar(fill="darkred")

# Basic barplot. Count the number of films in each genre
ggplot(data=comprehensive_rating,aes(x=Year)) + geom_bar(fill="green")
