First, I will load necessary libraries:
library(DBI)
library(RMySQL)
library(dplyr)
library(ggplot2)
library(googlesheets4)
library(patchwork)
Here is the code to connect and create a table in MySQL database. My credentials are stored in my Windows environment variables.
# Credentials to access database from MySQL
db_host <- Sys.getenv("DB_HOST")
db_port <- as.integer(Sys.getenv("DB_PORT"))
db_name <- Sys.getenv("DB_NAME")
db_user <- Sys.getenv("DB_USER")
db_pass <- Sys.getenv("DB_PASS")
print(db_host)
## [1] "cuny607sql.mysql.database.azure.com"
# MySQL database connection
con <- dbConnect(
drv = RMySQL::MySQL(),
dbname = db_name,
host = db_host,
username = db_user,
password = db_pass
)
query <- paste("CREATE TABLE IF NOT EXISTS Movies_new1 (
Timestamp DATETIME,
Name CHAR(100),
Gladiator2 INT,
Wolfs INT,
The_Substance INT,
Bad_Boys4 INT,
The_Beekeeper INT,
Rebel_Ridge INT,
PRIMARY KEY (Name)
)", sep = "")
dbExecute(con, query)
## [1] 0
I created a Google Form survey to collect movie reviews. Here the link to the survey:
https://docs.google.com/forms/d/e/1FAIpQLScraIXSeT6wwZyp4QuA06m1aX5TPUDmccTSz1_a6D5mJWuLBg/viewform
The form is connected to a Google Sheet, which is publicly accessible and stores survey responses. Here is the link to the Google Sheets:
https://docs.google.com/spreadsheets/d/1x7uAbOozoNutaFOfbq_X8c33vzSXvqshJxZgu_MfJhI/edit?usp=sharing
This code chunk pulls the data from Google Sheets:
# URL of the publicly accessible "movie_reviews" Sheet
sheet_url <- sheet_url <- "https://docs.google.com/spreadsheets/d/1x7uAbOozoNutaFOfbq_X8c33vzSXvqshJxZgu_MfJhI/edit?usp=sharing"
sheet_name <- "Form Responses 1"
# Read the data from the sheet
data <- read_sheet(sheet_url, sheet = sheet_name)
glimpse(data)
## Rows: 9
## Columns: 8
## $ Timestamp <dttm> 2025-02-05 17:34:19, 2025-02-05 18:12:59, 2025-02-05 18…
## $ Name <chr> "Aiuna", "Ilya", "Vadim", "Vladimir Storchevoy", "Timur"…
## $ Gladiator2 <dbl> 1, 3, 1, 5, 5, 4, 3, 4, 3
## $ Wolfs <dbl> NA, 3, NA, 5, 5, 5, 5, 4, 4
## $ The_Substance <dbl> 5, 3, 3, 5, 4, 3, 4, NA, 4
## $ Bad_Boys4 <dbl> 2, 3, 1, 5, 5, 5, NA, 5, 5
## $ The_Beekeeper <dbl> 2, NA, 3, 4, 3, 4, 3, 3, 3
## $ Rebel_Ridge <dbl> 1, 3, 3, 4, 3, 4, NA, 3, 3
print(colnames(data))
## [1] "Timestamp" "Name" "Gladiator2" "Wolfs"
## [5] "The_Substance" "Bad_Boys4" "The_Beekeeper" "Rebel_Ridge"
This function iterates through the rows and pulls each value from column in the sheet’s data and assigns it into each value of the same column in MySQL database table:
# Iterating through each variable values row in data.
for (i in 1:nrow(data)) {
name <- data$Name[i]
gladiator2 <- data$Gladiator2[i]
wolfs <- data$Wolfs[i]
the_substance <- data$The_Substance[i]
bad_boys4 <- data$Bad_Boys4[i]
the_beekeeper <- data$The_Beekeeper[i]
rebel_ridge <- data$Rebel_Ridge[i]
#SQL code to insert values for same variables in each new row in MySQL database
query <- paste0(
"INSERT IGNORE INTO Movies_new1 (Name, Gladiator2, Wolfs, The_Substance, Bad_Boys4, The_Beekeeper, Rebel_Ridge) VALUES (",
"'", name, "', ",
"'", gladiator2, "', ",
"'", wolfs, "', ",
"'", the_substance, "', ",
"'", bad_boys4, "', ",
"'", the_beekeeper, "', ",
"'", rebel_ridge, "')"
)
dbExecute(con, query)
}
This chunk transfers data from the SQL database into R dataframe df:
# Query to select all data from the table
query <- "SELECT * FROM Movies_new1"
# Execute the query and fetch the results
df <- dbGetQuery(con, query)
# Summary of df
glimpse(df)
## Rows: 9
## Columns: 8
## $ Timestamp <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA
## $ Name <chr> "Aiuna", "Bob", "Gosha", "Ilya", "James", "Kirill", "Tim…
## $ Gladiator2 <int> 1, 3, 4, 3, 3, 4, 5, 1, 5
## $ Wolfs <int> 0, 5, 5, 3, 4, 4, 5, 0, 5
## $ The_Substance <int> 5, 4, 3, 3, 4, 0, 4, 3, 5
## $ Bad_Boys4 <int> 2, 0, 5, 3, 5, 5, 5, 1, 5
## $ The_Beekeeper <int> 2, 3, 4, 0, 3, 3, 3, 3, 4
## $ Rebel_Ridge <int> 1, 0, 4, 3, 3, 3, 3, 3, 4
summary(df)
## Timestamp Name Gladiator2 Wolfs
## Length:9 Length:9 Min. :1.000 Min. :0.000
## Class :character Class :character 1st Qu.:3.000 1st Qu.:3.000
## Mode :character Mode :character Median :3.000 Median :4.000
## Mean :3.222 Mean :3.444
## 3rd Qu.:4.000 3rd Qu.:5.000
## Max. :5.000 Max. :5.000
## The_Substance Bad_Boys4 The_Beekeeper Rebel_Ridge
## Min. :0.000 Min. :0.000 Min. :0.000 Min. :0.000
## 1st Qu.:3.000 1st Qu.:2.000 1st Qu.:3.000 1st Qu.:3.000
## Median :4.000 Median :5.000 Median :3.000 Median :3.000
## Mean :3.444 Mean :3.444 Mean :2.778 Mean :2.667
## 3rd Qu.:4.000 3rd Qu.:5.000 3rd Qu.:3.000 3rd Qu.:3.000
## Max. :5.000 Max. :5.000 Max. :4.000 Max. :4.000
Here I remove irrelevant Timestamp column and preview data:
#remove Timestamp column
df_zeroes_to_means <- df |> select(-Timestamp)
glimpse(df_zeroes_to_means)
## Rows: 9
## Columns: 7
## $ Name <chr> "Aiuna", "Bob", "Gosha", "Ilya", "James", "Kirill", "Tim…
## $ Gladiator2 <int> 1, 3, 4, 3, 3, 4, 5, 1, 5
## $ Wolfs <int> 0, 5, 5, 3, 4, 4, 5, 0, 5
## $ The_Substance <int> 5, 4, 3, 3, 4, 0, 4, 3, 5
## $ Bad_Boys4 <int> 2, 0, 5, 3, 5, 5, 5, 1, 5
## $ The_Beekeeper <int> 2, 3, 4, 0, 3, 3, 3, 3, 4
## $ Rebel_Ridge <int> 1, 0, 4, 3, 3, 3, 3, 3, 4
This dataset contains some reviews with value of zero (0). These zero values are not actual reviews of the movies, zero values indicate that a person didn’t watch this movie and didn’t rate it in survey.
This code replaces zero values in variables to the mean of the of the same variable:
# Replace zeros with the mean of each numeric column and round the values
df_zeroes_to_means <- df_zeroes_to_means |>
mutate(across(where(is.numeric), ~ {
column_mean <- round(mean(.x[.x != 0], na.rm = TRUE))
.x[.x == 0] <- column_mean
round(.x, 0)
}))
glimpse(df_zeroes_to_means)
## Rows: 9
## Columns: 7
## $ Name <chr> "Aiuna", "Bob", "Gosha", "Ilya", "James", "Kirill", "Tim…
## $ Gladiator2 <dbl> 1, 3, 4, 3, 3, 4, 5, 1, 5
## $ Wolfs <dbl> 4, 5, 5, 3, 4, 4, 5, 4, 5
## $ The_Substance <dbl> 5, 4, 3, 3, 4, 4, 4, 3, 5
## $ Bad_Boys4 <dbl> 2, 4, 5, 3, 5, 5, 5, 1, 5
## $ The_Beekeeper <dbl> 2, 3, 4, 3, 3, 3, 3, 3, 4
## $ Rebel_Ridge <dbl> 1, 3, 4, 3, 3, 3, 3, 3, 4
summary(df$Wolfs)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 3.000 4.000 3.444 5.000 5.000
summary(df_zeroes_to_means$Wolfs)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 3.000 4.000 4.000 4.333 5.000 5.000
The summaries above show different statistical properties before and after replacing zeroes with means.
The reasons why I decided to replace missing values with means:
# Write the data to a CSV file}
write.csv(data, "C:/CUNY_MSDS/DATA607/LAB2/Movie_Reviews.csv", row.names = FALSE)
# Disconnecting from MySQL
dbDisconnect(con)
## [1] TRUE
summary(df)
## Timestamp Name Gladiator2 Wolfs
## Length:9 Length:9 Min. :1.000 Min. :0.000
## Class :character Class :character 1st Qu.:3.000 1st Qu.:3.000
## Mode :character Mode :character Median :3.000 Median :4.000
## Mean :3.222 Mean :3.444
## 3rd Qu.:4.000 3rd Qu.:5.000
## Max. :5.000 Max. :5.000
## The_Substance Bad_Boys4 The_Beekeeper Rebel_Ridge
## Min. :0.000 Min. :0.000 Min. :0.000 Min. :0.000
## 1st Qu.:3.000 1st Qu.:2.000 1st Qu.:3.000 1st Qu.:3.000
## Median :4.000 Median :5.000 Median :3.000 Median :3.000
## Mean :3.444 Mean :3.444 Mean :2.778 Mean :2.667
## 3rd Qu.:4.000 3rd Qu.:5.000 3rd Qu.:3.000 3rd Qu.:3.000
## Max. :5.000 Max. :5.000 Max. :4.000 Max. :4.000
df_NA <- df |>
mutate((across(where(is.numeric), ~na_if(., 0))))
summary(df_NA)
## Timestamp Name Gladiator2 Wolfs
## Length:9 Length:9 Min. :1.000 Min. :3.000
## Class :character Class :character 1st Qu.:3.000 1st Qu.:4.000
## Mode :character Mode :character Median :3.000 Median :5.000
## Mean :3.222 Mean :4.429
## 3rd Qu.:4.000 3rd Qu.:5.000
## Max. :5.000 Max. :5.000
## NA's :2
## The_Substance Bad_Boys4 The_Beekeeper Rebel_Ridge
## Min. :3.000 Min. :1.000 Min. :2.000 Min. :1.00
## 1st Qu.:3.000 1st Qu.:2.750 1st Qu.:3.000 1st Qu.:3.00
## Median :4.000 Median :5.000 Median :3.000 Median :3.00
## Mean :3.875 Mean :3.875 Mean :3.125 Mean :3.00
## 3rd Qu.:4.250 3rd Qu.:5.000 3rd Qu.:3.250 3rd Qu.:3.25
## Max. :5.000 Max. :5.000 Max. :4.000 Max. :4.00
## NA's :1 NA's :1 NA's :1 NA's :1
p1 <- ggplot(df, aes(y = Wolfs)) +
geom_boxplot()
p2 <- ggplot(df_zeroes_to_means, aes(y = Wolfs)) +
geom_boxplot()
p3 <- ggplot(df, aes(x = Wolfs)) +
geom_histogram()
p4 <- ggplot(df_zeroes_to_means, aes(x = Wolfs)) +
geom_histogram()
p1 + p2 + p3 + p4
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.