library(tidyverse)
## Warning: package 'dplyr' was built under R version 4.4.1
library(openintro)
## Warning: package 'openintro' was built under R version 4.4.1

Hiding login information:

library(rstudioapi)
username<-showPrompt(title='username', message='Enter username:', default="")
pw<-askForPassword(prompt="Enter password:")

###Forming DB connection

library(RMySQL)
## Warning: package 'RMySQL' was built under R version 4.4.1
## Loading required package: DBI
mydb <- dbConnect(MySQL(), user=username, password=pw, dbname=username, host="cunydata607sql.mysql.database.azure.com")
dbListTables(mydb)
## [1] "airlines" "airports" "flights"  "movie"    "movies"   "planes"   "weather"

Load the movies table into R dataframe

#write SQL command:
df<- dbGetQuery(mydb, "select * from movies")
df
##           Movies reviewer 1 reviewer 2 reviewer 3 reviewer 4 reviewer 5
## 1   Interstellar          5          2          2          5          5
## 2 Shutter Island          1          4          1          3          1
## 3  Spirited Away          2          1          2          5          3
## 4    The Minions          2          5          2          1          4
## 5      Toy Story          2          2          4          5          4
## 6      Your Name          5          2          4          5          4
##   reviewer 6
## 1          4
## 2          4
## 3          5
## 4          2
## 5          2
## 6          4

to view the dataframe summary

df<-df %>%
  mutate('averageScore'= floor(rowMeans(select(., where(is.numeric)), na.rm=TRUE)))
df 
##           Movies reviewer 1 reviewer 2 reviewer 3 reviewer 4 reviewer 5
## 1   Interstellar          5          2          2          5          5
## 2 Shutter Island          1          4          1          3          1
## 3  Spirited Away          2          1          2          5          3
## 4    The Minions          2          5          2          1          4
## 5      Toy Story          2          2          4          5          4
## 6      Your Name          5          2          4          5          4
##   reviewer 6 averageScore
## 1          4            3
## 2          4            2
## 3          5            3
## 4          2            2
## 5          2            3
## 6          4            4
library(ggplot2)
ggplot(df, aes(x = Movies, y = averageScore, color=Movies, fill=Movies, legend=FALSE)) +
  geom_bar(stat='identity') +
  labs(title = 'Average ratings from 6 reviews of the 6 popular movies', x='Movie names')+
  theme(legend.position = 'none')

#dbDisconnect()

Part 4: Missing data strategy

There are few approaches that i would take depending on the nature of the miss values. First, the proportion of the missing data is more than 50% of the columns total data, i might consider excluding the column from my analysis. Huge amount of missing data sometimes can be more misleading than providing meaningful insights Second, if the missing data is small (>20%) and is numerical, i will consider to use simple imputation or mean imputation to fill in the missing data with the column values’ mean. If the missing data are categorical, it will require addition analysis or even predictive model to try filling in the missing value

LS0tDQp0aXRsZTogIjYwNyBMYWIgMiBTUUwgREIgY29ubmVjdGlvbiINCmF1dGhvcjogIkNoaSBIYW5nIChQaGlsaXApIENoZXVuZyINCmRhdGU6ICJgciBTeXMuRGF0ZSgpYCINCm91dHB1dDogb3BlbmludHJvOjpsYWJfcmVwb3J0DQotLS0NCg0KYGBge3IgbG9hZC1wYWNrYWdlcywgbWVzc2FnZT1GQUxTRX0NCmxpYnJhcnkodGlkeXZlcnNlKQ0KbGlicmFyeShvcGVuaW50cm8pDQpgYGANCiMjIyBIaWRpbmcgbG9naW4gaW5mb3JtYXRpb246DQpgYGB7ciBpbmZvIGhpZGV9DQpsaWJyYXJ5KHJzdHVkaW9hcGkpDQp1c2VybmFtZTwtc2hvd1Byb21wdCh0aXRsZT0ndXNlcm5hbWUnLCBtZXNzYWdlPSdFbnRlciB1c2VybmFtZTonLCBkZWZhdWx0PSIiKQ0KcHc8LWFza0ZvclBhc3N3b3JkKHByb21wdD0iRW50ZXIgcGFzc3dvcmQ6IikNCmBgYA0KDQojIyNGb3JtaW5nIERCIGNvbm5lY3Rpb24NCg0KYGBge3IgY29kZS1jaHVuay1sYWJlbH0NCmxpYnJhcnkoUk15U1FMKQ0KbXlkYiA8LSBkYkNvbm5lY3QoTXlTUUwoKSwgdXNlcj11c2VybmFtZSwgcGFzc3dvcmQ9cHcsIGRibmFtZT11c2VybmFtZSwgaG9zdD0iY3VueWRhdGE2MDdzcWwubXlzcWwuZGF0YWJhc2UuYXp1cmUuY29tIikNCmRiTGlzdFRhYmxlcyhteWRiKQ0KYGBgDQoNCiMjIyBMb2FkIHRoZSBtb3ZpZXMgdGFibGUgaW50byBSIGRhdGFmcmFtZQ0KYGBge3J9DQojd3JpdGUgU1FMIGNvbW1hbmQ6DQpkZjwtIGRiR2V0UXVlcnkobXlkYiwgInNlbGVjdCAqIGZyb20gbW92aWVzIikNCmRmDQpgYGANCiMjIyB0byB2aWV3IHRoZSBkYXRhZnJhbWUgc3VtbWFyeQ0KYGBge3J9DQpkZjwtZGYgJT4lDQogIG11dGF0ZSgnYXZlcmFnZVNjb3JlJz0gZmxvb3Iocm93TWVhbnMoc2VsZWN0KC4sIHdoZXJlKGlzLm51bWVyaWMpKSwgbmEucm09VFJVRSkpKQ0KZGYgDQpgYGANCmBgYHtyIHBsb3R9DQpsaWJyYXJ5KGdncGxvdDIpDQpnZ3Bsb3QoZGYsIGFlcyh4ID0gTW92aWVzLCB5ID0gYXZlcmFnZVNjb3JlLCBjb2xvcj1Nb3ZpZXMsIGZpbGw9TW92aWVzLCBsZWdlbmQ9RkFMU0UpKSArDQogIGdlb21fYmFyKHN0YXQ9J2lkZW50aXR5JykgKw0KICBsYWJzKHRpdGxlID0gJ0F2ZXJhZ2UgcmF0aW5ncyBmcm9tIDYgcmV2aWV3cyBvZiB0aGUgNiBwb3B1bGFyIG1vdmllcycsIHg9J01vdmllIG5hbWVzJykrDQogIHRoZW1lKGxlZ2VuZC5wb3NpdGlvbiA9ICdub25lJykNCmBgYA0KDQpgYGB7ciBjbG9zZSBjb25uZWN0aW9ufQ0KI2RiRGlzY29ubmVjdCgpDQpgYGANCg0KIyMgUGFydCA0OiBNaXNzaW5nIGRhdGEgc3RyYXRlZ3kNClRoZXJlIGFyZSBmZXcgYXBwcm9hY2hlcyB0aGF0IGkgd291bGQgdGFrZSBkZXBlbmRpbmcgb24gdGhlIG5hdHVyZSBvZiB0aGUgbWlzcyB2YWx1ZXMuDQpGaXJzdCwgdGhlIHByb3BvcnRpb24gb2YgdGhlIG1pc3NpbmcgZGF0YSBpcyBtb3JlIHRoYW4gNTAlIG9mIHRoZSBjb2x1bW5zIHRvdGFsIGRhdGEsIGkgbWlnaHQgY29uc2lkZXIgZXhjbHVkaW5nIHRoZSBjb2x1bW4gZnJvbSBteSBhbmFseXNpcy4gSHVnZSBhbW91bnQgb2YgbWlzc2luZyBkYXRhIHNvbWV0aW1lcyBjYW4gYmUgbW9yZSBtaXNsZWFkaW5nIHRoYW4gcHJvdmlkaW5nIG1lYW5pbmdmdWwgaW5zaWdodHMNClNlY29uZCwgaWYgdGhlIG1pc3NpbmcgZGF0YSBpcyBzbWFsbCAoPjIwJSkgYW5kIGlzIG51bWVyaWNhbCwgaSB3aWxsIGNvbnNpZGVyIHRvIHVzZSBzaW1wbGUgaW1wdXRhdGlvbiBvciBtZWFuIGltcHV0YXRpb24gdG8gZmlsbCBpbiB0aGUgbWlzc2luZyBkYXRhIHdpdGggdGhlIGNvbHVtbiB2YWx1ZXMnIG1lYW4uDQpJZiB0aGUgbWlzc2luZyBkYXRhIGFyZSBjYXRlZ29yaWNhbCwgaXQgd2lsbCByZXF1aXJlIGFkZGl0aW9uIGFuYWx5c2lzIG9yIGV2ZW4gcHJlZGljdGl2ZSBtb2RlbCB0byB0cnkgZmlsbGluZyBpbiB0aGUgbWlzc2luZyB2YWx1ZQ==