Establish the connection

#Connection to SQL table and relevant libraries 
library(RMySQL)
## Warning: package 'RMySQL' was built under R version 3.4.3
library(sqldf)
## Warning: package 'gsubfn' was built under R version 3.2.5
## Warning: package 'proto' was built under R version 3.2.5

Opening database connection

#Hide actual connection due to password 

#connection <- dbConnect(MySQL(), user='root', password='*******', 
                 #dbname='movies_db', host='127.0.0.1')

#dbListTables(connection)

Fetch the data & print out samples

#establish the connection and fetch the data from the tables 
imdb_info_rs <- dbSendQuery(connection , "select * from imdb_table")

imdb_info_df <- fetch(imdb_info_rs, n=-1)
imdb_info_df
Title_Generated_Number Title Year Votes_IMDB Rank_IMBD
573 The Shawshank Redemption 1994 1071904 1
574 The Godfather 1972 751381 2
575 The Godfather: Part II 1974 488889 3
576 Pulp Fiction 1994 830504 4
577 The Good, the Bad and the Ugly 1966 322961 5
ratings_rs <- dbSendQuery(connection , "select * from rating_dim")

rating_info_df <- fetch(ratings_rs, n=-1)
rating_info_df
Title_Generated_Number Liam_Ranking Meaghan_Ranking Mom_Ranking Heather_Ranking Dad_Ranking Average__Ranking
573 8 7 2 2 4 5
574 7 8 2 21 10 10
575 21 11 17 3 22 15
576 12 12 10 14 22 14
577 17 23 9 13 4 13
genre_rs <- dbSendQuery(connection , "select * from genre_dim")

genre_info_df <- fetch(genre_rs, n=-1)
genre_info_df
Title_Generated_Number genre
573 Crime
574 Drama
575 Drama
576 Comedy
577 Action
###Merge the data on “Tit le_Generated_Number“
comprehensive_rating<- merge(x = imdb_info_df, y = rating_info_df, by = "Title_Generated_Number", all = TRUE)
comprehensive_rating<- merge(x = comprehensive_rating, y = genre_info_df, by = "Title_Generated_Number", all = TRUE)


head(comprehensive_rating,10)
##    Title_Generated_Number                            Title Year Votes_IMDB
## 1                     573        The Shawshank Redemption  1994    1071904
## 2                     574                   The Godfather  1972     751381
## 3                     575          The Godfather: Part II  1974     488889
## 4                     576                    Pulp Fiction  1994     830504
## 5                     577  The Good, the Bad and the Ugly  1966     322961
## 6                     578                 The Dark Knight  2008    1045186
## 7                     579                    12 Angry Men  1957     264112
## 8                     580                Schindler's List  1993     545703
## 9                     581 The Lord of the Rings Fellowship 2003     758388
## 10                    582                      Fight Club  1999     814389
##    Rank_IMBD Liam_Ranking Meaghan_Ranking Mom_Ranking Heather_Ranking
## 1          1            8               7           2               2
## 2          2            7               8           2              21
## 3          3           21              11          17               3
## 4          4           12              12          10              14
## 5          5           17              23           9              13
## 6          6            1              14          24              28
## 7          7            9              20          20              26
## 8          8            2               3           7              30
## 9          9            6               4          22              14
## 10        10           18              18          28              20
##    Dad_Ranking Average__Ranking  genre
## 1            4                5  Crime
## 2           10               10  Drama
## 3           22               15  Drama
## 4           22               14 Comedy
## 5            4               13 Action
## 6           16               17 Action
## 7           27               20  Drama
## 8           21               13  Drama
## 9           26               14 Action
## 10          20               21 Comedy

Explore the data

#Display the summary statistics of "comprehensive_ratings""
summary(comprehensive_rating)
## Warning in summary(comprehensive_rating): bytecode version mismatch; using
## eval
##  Title_Generated_Number    Title               Year          
##  Min.   :573.0          Length:30          Length:30         
##  1st Qu.:580.2          Class :character   Class :character  
##  Median :587.5          Mode  :character   Mode  :character  
##  Mean   :587.5                                               
##  3rd Qu.:594.8                                               
##  Max.   :602.0                                               
##    Votes_IMDB        Rank_IMBD      Liam_Ranking   Meaghan_Ranking
##  Min.   : 145769   Min.   : 1.00   Min.   : 1.00   Min.   : 1.00  
##  1st Qu.: 329696   1st Qu.: 8.25   1st Qu.: 8.25   1st Qu.: 8.25  
##  Median : 523694   Median :15.50   Median :15.50   Median :15.50  
##  Mean   : 545994   Mean   :15.50   Mean   :15.50   Mean   :15.50  
##  3rd Qu.: 756636   3rd Qu.:22.75   3rd Qu.:22.75   3rd Qu.:22.75  
##  Max.   :1071904   Max.   :30.00   Max.   :30.00   Max.   :30.00  
##   Mom_Ranking    Heather_Ranking  Dad_Ranking    Average__Ranking
##  Min.   : 1.00   Min.   : 2.00   Min.   : 1.00   Min.   : 5.00   
##  1st Qu.: 7.25   1st Qu.:10.25   1st Qu.: 5.25   1st Qu.:13.25   
##  Median :16.50   Median :15.50   Median :15.00   Median :15.50   
##  Mean   :14.83   Mean   :16.13   Mean   :14.50   Mean   :15.37   
##  3rd Qu.:22.00   3rd Qu.:21.00   3rd Qu.:21.75   3rd Qu.:18.00   
##  Max.   :28.00   Max.   :30.00   Max.   :30.00   Max.   :23.00   
##     genre          
##  Length:30         
##  Class :character  
##  Mode  :character  
##                    
##                    
## 
#Display the column names of "comprehensive_ratings"
names(comprehensive_rating)
##  [1] "Title_Generated_Number" "Title"                 
##  [3] "Year"                   "Votes_IMDB"            
##  [5] "Rank_IMBD"              "Liam_Ranking"          
##  [7] "Meaghan_Ranking"        "Mom_Ranking"           
##  [9] "Heather_Ranking"        "Dad_Ranking"           
## [11] "Average__Ranking"       "genre"
#Display the structure of "comprehensive_rating".
str(comprehensive_rating)
## 'data.frame':    30 obs. of  12 variables:
##  $ Title_Generated_Number: int  573 574 575 576 577 578 579 580 581 582 ...
##  $ Title                 : chr  "The Shawshank Redemption " "The Godfather " "The Godfather: Part II " "Pulp Fiction " ...
##  $ Year                  : chr  "1994" "1972" "1974" "1994" ...
##  $ Votes_IMDB            : int  1071904 751381 488889 830504 322961 1045186 264112 545703 758388 814389 ...
##  $ Rank_IMBD             : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ Liam_Ranking          : int  8 7 21 12 17 1 9 2 6 18 ...
##  $ Meaghan_Ranking       : int  7 8 11 12 23 14 20 3 4 18 ...
##  $ Mom_Ranking           : int  2 2 17 10 9 24 20 7 22 28 ...
##  $ Heather_Ranking       : int  2 21 3 14 13 28 26 30 14 20 ...
##  $ Dad_Ranking           : int  4 10 22 22 4 16 27 21 26 20 ...
##  $ Average__Ranking      : int  5 10 15 14 13 17 20 13 14 21 ...
##  $ genre                 : chr  "Crime" "Drama" "Drama" "Comedy" ...
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.2.5
# Basic barplot. Average family rank for the top imdb movies 
p<-ggplot(data=comprehensive_rating, aes(x=Title_Generated_Number, y=Average__Ranking)) +
  geom_bar(stat="identity", fill="steelblue")+
  geom_text(aes(label=Average__Ranking), vjust=-0.3, size=3.5)+
  theme_minimal()
p

# Horizontal bar plot
p + coord_flip()

# Basic scatter plot with regression line IMDB Rank VS Average family Rank
ggplot(comprehensive_rating, aes(x=Rank_IMBD, y=Average__Ranking)) + geom_point(size=2, shape=23)+
  geom_smooth(method=lm)

# Basic barplot. Count the number of films in each genre 
ggplot(data=comprehensive_rating,aes(x=genre)) + geom_bar(fill="darkred")

# Basic barplot. Count the number of films in each genre 
ggplot(data=comprehensive_rating,aes(x=Year)) + geom_bar(fill="green")