library(tidyverse)
library(kableExtra)
library(RMySQL)
library(RODBC)
sql_conn <- dbConnect(MySQL(), user= usr, password = pwd, dbname='movies', host='localhost')
Please click here to access SQL script to create the two tables in database.
Read the rating table in r data frame
m_rating_tb = read.csv("https://raw.githubusercontent.com/Naik-Khyati/r_sql/main/input_data/m_rating.csv")
dbWriteTable(sql_conn,"m_rating",m_rating_tb, row.names = FALSE, overwrite = TRUE)
## [1] TRUE
m_rating_dt <- dbGetQuery(sql_conn, 'SELECT * FROM m_rating')
Read the movie name table in r data frame
m_name_tb = read.csv("https://raw.githubusercontent.com/Naik-Khyati/r_sql/main/input_data/m_name.csv")
dbWriteTable(sql_conn,"m_name",m_name_tb, row.names = FALSE, overwrite = TRUE)
## [1] TRUE
m_name_dt <- dbGetQuery(sql_conn, 'SELECT * FROM m_name')
m_rating_dt <- left_join(m_rating_dt, m_name_dt, by='movie_id')
m_rating_dt[is.na(m_rating_dt)] <- 0
m_rating_dt <- m_rating_dt %>% mutate(rating=pmax(rating1,rating2,rating3,rating4,rating5))
Note that rating=0 represents missing values. As shown in below table, there are 13 movies with missing rating in the dataframe.
m_rating_dt %>% filter (rating==0) %>% group_by(name) %>% summarise(Freq = n())
## # A tibble: 10 × 2
## name Freq
## <chr> <int>
## 1 Brian 2
## 2 Cindy 1
## 3 Eul 1
## 4 Greg 1
## 5 Jaya 1
## 6 Matt 1
## 7 Paul 1
## 8 Robert 1
## 9 Sara 1
## 10 Tim 3
There can be multiple ways to deal with missing values.
In this data frame, we are going to replace the missing values with average rating for corresponding respondent.
Below code snippet creates a data frame with average rating by the respondent considering only movies with non missing rating.
missing_values_rep <- m_rating_dt %>% filter (rating!=0) %>% group_by(name) %>% summarise(Freq = n(),avg_rating = min(5,ceiling(mean(rating)))) %>% ungroup()
m_rating_dt <- left_join(m_rating_dt, missing_values_rep, by='name')
m_rating_dt <- m_rating_dt %>%
mutate(final_rating = ifelse(rating == 0, avg_rating, rating))
table(m_rating_dt$final_rating)
##
## 1 2 3 4 5
## 15 14 13 21 15
Create aggregated data frame for movie name by average rating
chart_data <- m_rating_dt %>% group_by (movie_name) %>% summarise(avg_rating=mean(final_rating))%>%ungroup()
ggplot(chart_data, aes(x = movie_name, y = avg_rating, fill = movie_name)) +
geom_bar(stat = "identity") +
geom_text(aes(label = round(avg_rating,digits=2)), vjust = 0) +
labs(title="Average Movie Rating", x= "Movie", y = "Average Rating") +
theme_void()+ theme(plot.title=element_text(hjust=0.5))