library(RMySQL)
## Loading required package: DBI
library(hrbrthemes)
## NOTE: Either Arial Narrow or Roboto Condensed fonts are required to use these themes.
## Please use hrbrthemes::import_roboto_condensed() to install Roboto Condensed and
## if Arial Narrow is not on your system, please see https://bit.ly/arialnarrow
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.4.4 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(knitr)
This extract the values for our rating by connecting to a local database in my machine, I provided an alternate csv method of loading the movie ratings table through csv, with the csv file.
#Here i will extract results from database
mysqlconnection = dbConnect(RMySQL::MySQL(), dbname='movie_ratings', host='localhost', port=3306, user='root', password='Uussd0vtusuZ')
dbListTables(mysqlconnection)
## [1] "ratings"
result = dbSendQuery(mysqlconnection, "select * from ratings")
data.frame = fetch(result, n = 10)
#Below all NA value are displayed
print(data.frame)
## id barbie past_lives the_holdovers anatomy_of_fall poor_things air name
## 1 1 3 5 5 1 NA 3 Jennifer
## 2 2 5 NA 2 3 4 NA Jonathan
## 3 3 4 2 3 NA 4 5 daglish
## 4 4 5 4 0 4 2 3 manolo
## 5 5 5 2 3 4 2 1 verde
## 6 6 1 3 5 5 NA 5 juaquin
## 7 7 4 NA NA 5 4 4 bandido
## 8 8 4 4 4 4 4 5 nelson
## 9 9 3 4 4 4 3 3 aaron
## 10 10 5 5 5 NA 5 5 john
As csv we extract null values and must convert them to NA in order for it to function just like reading it from a database would
data.frame <- read_csv("movie_ratings.csv")
## Rows: 10 Columns: 8
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (6): past_lives, the_holdovers, anatomy_of_fall, poor_things, air, name
## dbl (2): id, barbie
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
data.frame[data.frame == 'NULL'] <- NA
data.frame <- data.frame |> mutate_at(c("barbie", "past_lives", "the_holdovers", "anatomy_of_fall", "poor_things", "air"), as.numeric)
print(data.frame[2:7])
## # A tibble: 10 × 6
## barbie past_lives the_holdovers anatomy_of_fall poor_things air
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 3 5 5 1 NA 3
## 2 5 NA 2 3 4 NA
## 3 4 2 3 NA 4 5
## 4 5 4 0 4 2 3
## 5 5 2 3 4 2 1
## 6 1 3 5 5 NA 5
## 7 4 NA NA 5 4 4
## 8 4 4 4 4 4 5
## 9 3 4 4 4 3 3
## 10 5 5 5 NA 5 5
In each column we’d like to remove all NA values in order to calculate average rating based on how many people voted for the movie
data.frame.reduced <- data.frame |> select(barbie : air)
df <- tribble(~movie_name, ~average,)
kable(df)
| movie_name | average |
|---|
for (i in colnames(data.frame.reduced)){
df[nrow(df) + 1,] = list(i, mean( na.omit(data.frame[[i]])))
}
ggplot(df, aes(x = reorder(movie_name, -average), y = average, fill = movie_name)) + geom_bar(stat = "identity") + theme_ipsum()