DATA 607 Week 2 assignment

Part 1: Build Table

First, I will load necessary libraries:

library(DBI)
library(RMySQL)
library(dplyr)
library(ggplot2)
library(googlesheets4)
library(patchwork)

Here is the code to connect and create a table in MySQL database. My credentials are stored in my Windows environment variables.

# Credentials to access database from MySQL
db_host <- Sys.getenv("DB_HOST")
db_port <- as.integer(Sys.getenv("DB_PORT"))
db_name <- Sys.getenv("DB_NAME")
db_user <- Sys.getenv("DB_USER")
db_pass <- Sys.getenv("DB_PASS")
print(db_host)

## [1] "cuny607sql.mysql.database.azure.com"

# MySQL database connection
con <- dbConnect(
                 drv = RMySQL::MySQL(), 
                 dbname = db_name, 
                 host = db_host,
                 username = db_user,
                 password = db_pass
                 )
query <- paste("CREATE TABLE IF NOT EXISTS Movies_new1 (
              Timestamp DATETIME,
              Name CHAR(100),
              Gladiator2 INT,
              Wolfs INT,
              The_Substance INT,
              Bad_Boys4 INT,
              The_Beekeeper INT,
              Rebel_Ridge INT,
              PRIMARY KEY (Name)
              )", sep = "")

dbExecute(con, query)

## [1] 0

I created a Google Form survey to collect movie reviews. Here the link to the survey:

https://docs.google.com/forms/d/e/1FAIpQLScraIXSeT6wwZyp4QuA06m1aX5TPUDmccTSz1_a6D5mJWuLBg/viewform

The form is connected to a Google Sheet, which is publicly accessible and stores survey responses. Here is the link to the Google Sheets:

https://docs.google.com/spreadsheets/d/1x7uAbOozoNutaFOfbq_X8c33vzSXvqshJxZgu_MfJhI/edit?usp=sharing

This code chunk pulls the data from Google Sheets:

# URL of the publicly accessible "movie_reviews" Sheet
sheet_url <- sheet_url <- "https://docs.google.com/spreadsheets/d/1x7uAbOozoNutaFOfbq_X8c33vzSXvqshJxZgu_MfJhI/edit?usp=sharing"
sheet_name <- "Form Responses 1"

# Read the data from the sheet
data <- read_sheet(sheet_url, sheet = sheet_name)
glimpse(data)

## Rows: 9
## Columns: 8
## $ Timestamp     <dttm> 2025-02-05 17:34:19, 2025-02-05 18:12:59, 2025-02-05 18…
## $ Name          <chr> "Aiuna", "Ilya", "Vadim", "Vladimir Storchevoy", "Timur"…
## $ Gladiator2    <dbl> 1, 3, 1, 5, 5, 4, 3, 4, 3
## $ Wolfs         <dbl> NA, 3, NA, 5, 5, 5, 5, 4, 4
## $ The_Substance <dbl> 5, 3, 3, 5, 4, 3, 4, NA, 4
## $ Bad_Boys4     <dbl> 2, 3, 1, 5, 5, 5, NA, 5, 5
## $ The_Beekeeper <dbl> 2, NA, 3, 4, 3, 4, 3, 3, 3
## $ Rebel_Ridge   <dbl> 1, 3, 3, 4, 3, 4, NA, 3, 3

print(colnames(data))

## [1] "Timestamp"     "Name"          "Gladiator2"    "Wolfs"        
## [5] "The_Substance" "Bad_Boys4"     "The_Beekeeper" "Rebel_Ridge"

Part 2: Store data in SQL database.

This function iterates through the rows and pulls each value from column in the sheet’s data and assigns it into each value of the same column in MySQL database table:

# Iterating through each variable values row in data.
for (i in 1:nrow(data)) {
  name <- data$Name[i]
  gladiator2 <- data$Gladiator2[i]
  wolfs <- data$Wolfs[i]
  the_substance <- data$The_Substance[i]
  bad_boys4 <- data$Bad_Boys4[i]
  the_beekeeper <- data$The_Beekeeper[i]
  rebel_ridge <- data$Rebel_Ridge[i]
  
  #SQL code to insert values for same variables in each new row in MySQL database
  query <- paste0(
    "INSERT IGNORE INTO Movies_new1 (Name, Gladiator2, Wolfs, The_Substance, Bad_Boys4, The_Beekeeper, Rebel_Ridge) VALUES (",
    "'", name, "', ",
    "'", gladiator2, "', ",
    "'", wolfs, "', ",
    "'", the_substance, "', ",
    "'", bad_boys4, "', ",
    "'", the_beekeeper, "', ",
    "'", rebel_ridge, "')"
  )
  
  dbExecute(con, query)
}

Part 3: Transfer data from SQL database to R dataframe.

This chunk transfers data from the SQL database into R dataframe df:

# Query to select all data from the table
query <- "SELECT * FROM Movies_new1"
# Execute the query and fetch the results
df <- dbGetQuery(con, query)


# Summary of df
glimpse(df)

## Rows: 9
## Columns: 8
## $ Timestamp     <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA
## $ Name          <chr> "Aiuna", "Bob", "Gosha", "Ilya", "James", "Kirill", "Tim…
## $ Gladiator2    <int> 1, 3, 4, 3, 3, 4, 5, 1, 5
## $ Wolfs         <int> 0, 5, 5, 3, 4, 4, 5, 0, 5
## $ The_Substance <int> 5, 4, 3, 3, 4, 0, 4, 3, 5
## $ Bad_Boys4     <int> 2, 0, 5, 3, 5, 5, 5, 1, 5
## $ The_Beekeeper <int> 2, 3, 4, 0, 3, 3, 3, 3, 4
## $ Rebel_Ridge   <int> 1, 0, 4, 3, 3, 3, 3, 3, 4

summary(df)

##   Timestamp             Name             Gladiator2        Wolfs      
##  Length:9           Length:9           Min.   :1.000   Min.   :0.000  
##  Class :character   Class :character   1st Qu.:3.000   1st Qu.:3.000  
##  Mode  :character   Mode  :character   Median :3.000   Median :4.000  
##                                        Mean   :3.222   Mean   :3.444  
##                                        3rd Qu.:4.000   3rd Qu.:5.000  
##                                        Max.   :5.000   Max.   :5.000  
##  The_Substance     Bad_Boys4     The_Beekeeper    Rebel_Ridge   
##  Min.   :0.000   Min.   :0.000   Min.   :0.000   Min.   :0.000  
##  1st Qu.:3.000   1st Qu.:2.000   1st Qu.:3.000   1st Qu.:3.000  
##  Median :4.000   Median :5.000   Median :3.000   Median :3.000  
##  Mean   :3.444   Mean   :3.444   Mean   :2.778   Mean   :2.667  
##  3rd Qu.:4.000   3rd Qu.:5.000   3rd Qu.:3.000   3rd Qu.:3.000  
##  Max.   :5.000   Max.   :5.000   Max.   :4.000   Max.   :4.000

Part 4: Missing data strategy

Here I remove irrelevant Timestamp column and preview data:

#remove Timestamp column

df_zeroes_to_means <- df |> select(-Timestamp)
glimpse(df_zeroes_to_means)

## Rows: 9
## Columns: 7
## $ Name          <chr> "Aiuna", "Bob", "Gosha", "Ilya", "James", "Kirill", "Tim…
## $ Gladiator2    <int> 1, 3, 4, 3, 3, 4, 5, 1, 5
## $ Wolfs         <int> 0, 5, 5, 3, 4, 4, 5, 0, 5
## $ The_Substance <int> 5, 4, 3, 3, 4, 0, 4, 3, 5
## $ Bad_Boys4     <int> 2, 0, 5, 3, 5, 5, 5, 1, 5
## $ The_Beekeeper <int> 2, 3, 4, 0, 3, 3, 3, 3, 4
## $ Rebel_Ridge   <int> 1, 0, 4, 3, 3, 3, 3, 3, 4

This dataset contains some reviews with value of zero (0). These zero values are not actual reviews of the movies, zero values indicate that a person didn’t watch this movie and didn’t rate it in survey.

This code replaces zero values in variables to the mean of the of the same variable:

# Replace zeros with the mean of each numeric column and round the values
df_zeroes_to_means <- df_zeroes_to_means |>
  mutate(across(where(is.numeric), ~ {
    column_mean <- round(mean(.x[.x != 0], na.rm = TRUE))
    .x[.x == 0] <- column_mean
    round(.x, 0)
  }))

glimpse(df_zeroes_to_means)

## Rows: 9
## Columns: 7
## $ Name          <chr> "Aiuna", "Bob", "Gosha", "Ilya", "James", "Kirill", "Tim…
## $ Gladiator2    <dbl> 1, 3, 4, 3, 3, 4, 5, 1, 5
## $ Wolfs         <dbl> 4, 5, 5, 3, 4, 4, 5, 4, 5
## $ The_Substance <dbl> 5, 4, 3, 3, 4, 4, 4, 3, 5
## $ Bad_Boys4     <dbl> 2, 4, 5, 3, 5, 5, 5, 1, 5
## $ The_Beekeeper <dbl> 2, 3, 4, 3, 3, 3, 3, 3, 4
## $ Rebel_Ridge   <dbl> 1, 3, 4, 3, 3, 3, 3, 3, 4

summary(df$Wolfs)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   3.000   4.000   3.444   5.000   5.000

summary(df_zeroes_to_means$Wolfs)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   3.000   4.000   4.000   4.333   5.000   5.000

The summaries above show different statistical properties before and after replacing zeroes with means.

The reasons why I decided to replace missing values with means:

It can preserve statistical properties of the dataset.
It reduces bias and prevent skewing analysis results.
Variable with some missing values can effect the mean of that variable.
Data is more consistent and easier to interpret.
Missing and NA values can cause wrong charts and graphs. This is the reason I didn’t replace zeroes with NAs.
I didn’t want to remove rows with zeroes, because it can cause loss of valuable information.

# Write the data to a CSV file}
write.csv(data, "C:/CUNY_MSDS/DATA607/LAB2/Movie_Reviews.csv", row.names = FALSE)



# Disconnecting from MySQL
dbDisconnect(con)

## [1] TRUE

summary(df)

##   Timestamp             Name             Gladiator2        Wolfs      
##  Length:9           Length:9           Min.   :1.000   Min.   :0.000  
##  Class :character   Class :character   1st Qu.:3.000   1st Qu.:3.000  
##  Mode  :character   Mode  :character   Median :3.000   Median :4.000  
##                                        Mean   :3.222   Mean   :3.444  
##                                        3rd Qu.:4.000   3rd Qu.:5.000  
##                                        Max.   :5.000   Max.   :5.000  
##  The_Substance     Bad_Boys4     The_Beekeeper    Rebel_Ridge   
##  Min.   :0.000   Min.   :0.000   Min.   :0.000   Min.   :0.000  
##  1st Qu.:3.000   1st Qu.:2.000   1st Qu.:3.000   1st Qu.:3.000  
##  Median :4.000   Median :5.000   Median :3.000   Median :3.000  
##  Mean   :3.444   Mean   :3.444   Mean   :2.778   Mean   :2.667  
##  3rd Qu.:4.000   3rd Qu.:5.000   3rd Qu.:3.000   3rd Qu.:3.000  
##  Max.   :5.000   Max.   :5.000   Max.   :4.000   Max.   :4.000

df_NA <- df |>
  mutate((across(where(is.numeric), ~na_if(., 0))))
summary(df_NA)

##   Timestamp             Name             Gladiator2        Wolfs      
##  Length:9           Length:9           Min.   :1.000   Min.   :3.000  
##  Class :character   Class :character   1st Qu.:3.000   1st Qu.:4.000  
##  Mode  :character   Mode  :character   Median :3.000   Median :5.000  
##                                        Mean   :3.222   Mean   :4.429  
##                                        3rd Qu.:4.000   3rd Qu.:5.000  
##                                        Max.   :5.000   Max.   :5.000  
##                                                        NA's   :2      
##  The_Substance     Bad_Boys4     The_Beekeeper    Rebel_Ridge  
##  Min.   :3.000   Min.   :1.000   Min.   :2.000   Min.   :1.00  
##  1st Qu.:3.000   1st Qu.:2.750   1st Qu.:3.000   1st Qu.:3.00  
##  Median :4.000   Median :5.000   Median :3.000   Median :3.00  
##  Mean   :3.875   Mean   :3.875   Mean   :3.125   Mean   :3.00  
##  3rd Qu.:4.250   3rd Qu.:5.000   3rd Qu.:3.250   3rd Qu.:3.25  
##  Max.   :5.000   Max.   :5.000   Max.   :4.000   Max.   :4.00  
##  NA's   :1       NA's   :1       NA's   :1       NA's   :1

p1 <- ggplot(df, aes(y = Wolfs)) +
  geom_boxplot()
  
p2 <- ggplot(df_zeroes_to_means, aes(y = Wolfs)) +
  geom_boxplot()

p3 <- ggplot(df, aes(x = Wolfs)) +
  geom_histogram()

p4 <- ggplot(df_zeroes_to_means, aes(x = Wolfs)) +
  geom_histogram()

p1 + p2 + p3 + p4

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

DATA 607 Week 2 assignment

Farhod Ibragimov

2025-02-09

Part 1: Build Table

Part 2: Store data in SQL database.

Part 3: Transfer data from SQL database to R dataframe.

Part 4: Missing data strategy