Load Libraries
library(RODBC) #To access MySQL database
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5 v purrr 0.3.4
## v tibble 3.1.4 v dplyr 1.0.7
## v tidyr 1.1.3 v stringr 1.4.0
## v readr 2.0.1 v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
Connect to the MySQL database using odbc connect and a dsn
conn <- odbcConnect("MySQL_DSN")
Read the reviewers information from MySQL database into R dataframe
reviews <- sqlQuery(conn, "SELECT * FROM reviewers")
head(reviews, n=20)
## reviewer_id first_name last_name title rating
## 1 1 Ashe Denis Avengers 5
## 2 1 Ashe Denis Black Panther 4
## 3 1 Ashe Denis Something Borrowed 5
## 4 1 Ashe Denis Red 3
## 5 1 Ashe Denis Pure Haven 1
## 6 1 Ashe Denis Good Deed 5
## 7 2 Chinonye Onyeka Avengers 5
## 8 2 Chinonye Onyeka Black Panther 2
## 9 2 Chinonye Onyeka Something Borrowed 4
## 10 2 Chinonye Onyeka Red 5
## 11 2 Chinonye Onyeka Pure Haven 4
## 12 2 Chinonye Onyeka Good Deed 3
## 13 3 Kingsley Obi Avengers 4
## 14 3 Kingsley Obi Black Panther 4
## 15 3 Kingsley Obi Something Borrowed 4
## 16 3 Kingsley Obi Red 4
## 17 3 Kingsley Obi Pure Haven 4
## 18 3 Kingsley Obi Good Deed 4
## 19 4 Emmanuel Nnamdi Avengers 5
## 20 4 Emmanuel Nnamdi Black Panther 4
str(reviews)
## 'data.frame': 30 obs. of 5 variables:
## $ reviewer_id: int 1 1 1 1 1 1 2 2 2 2 ...
## $ first_name : chr "Ashe" "Ashe" "Ashe" "Ashe" ...
## $ last_name : chr "Denis" "Denis" "Denis" "Denis" ...
## $ title : chr "Avengers" "Black Panther" "Something Borrowed" "Red" ...
## $ rating : int 5 4 5 3 1 5 5 2 4 5 ...
Check if the dataframe contains any missing values
anyNA(reviews)
## [1] TRUE
The missing values are in the rating column
anyNA(reviews$rating)
## [1] TRUE
Missing Values : I omit missing values
# avg_rating <- round(mean(reviews$rating, na.rm = TRUE),0)
reviews_new <- reviews %>% na.omit()
Check for missingness in the new dataframe
anyNA(reviews_new)
## [1] FALSE
Check the head of the new dataframe
head(reviews_new, n = 20)
## reviewer_id first_name last_name title rating
## 1 1 Ashe Denis Avengers 5
## 2 1 Ashe Denis Black Panther 4
## 3 1 Ashe Denis Something Borrowed 5
## 4 1 Ashe Denis Red 3
## 5 1 Ashe Denis Pure Haven 1
## 6 1 Ashe Denis Good Deed 5
## 7 2 Chinonye Onyeka Avengers 5
## 8 2 Chinonye Onyeka Black Panther 2
## 9 2 Chinonye Onyeka Something Borrowed 4
## 10 2 Chinonye Onyeka Red 5
## 11 2 Chinonye Onyeka Pure Haven 4
## 12 2 Chinonye Onyeka Good Deed 3
## 13 3 Kingsley Obi Avengers 4
## 14 3 Kingsley Obi Black Panther 4
## 15 3 Kingsley Obi Something Borrowed 4
## 16 3 Kingsley Obi Red 4
## 17 3 Kingsley Obi Pure Haven 4
## 18 3 Kingsley Obi Good Deed 4
## 19 4 Emmanuel Nnamdi Avengers 5
## 20 4 Emmanuel Nnamdi Black Panther 4
Check the structure
str(reviews_new)
## 'data.frame': 27 obs. of 5 variables:
## $ reviewer_id: int 1 1 1 1 1 1 2 2 2 2 ...
## $ first_name : chr "Ashe" "Ashe" "Ashe" "Ashe" ...
## $ last_name : chr "Denis" "Denis" "Denis" "Denis" ...
## $ title : chr "Avengers" "Black Panther" "Something Borrowed" "Red" ...
## $ rating : int 5 4 5 3 1 5 5 2 4 5 ...
## - attr(*, "na.action")= 'omit' Named int [1:3] 25 27 28
## ..- attr(*, "names")= chr [1:3] "25" "27" "28"
Show all 6 videos
sqlQuery(conn, "SELECT * FROM videos")
## video_id title
## 1 1 Avengers
## 2 2 Black Panther
## 3 3 Something Borrowed
## 4 4 Red
## 5 5 Pure Haven
## 6 6 Good Deed