Data 607 - SQL and R

General Overview

This assignment will utilize SQL and R. The files will be accessible from my github page which will be here.

Generating Movie List

Utilizing the following script will generate movies.sql.

movie_ids <- 1:20
movie_names <- c(
  "The Shawshank Redemption", "The Godfather", "The Dark Knight", 
  "Pulp Fiction", "Schindler's List", "Forrest Gump", 
  "Inception", "Fight Club", "The Matrix", 
  "Goodfellas", "The Empire Strikes Back", "Interstellar", 
  "Gladiator", "The Lord of the Rings: The Return of the King", 
  "Back to the Future", "Saving Private Ryan", 
  "The Silence of the Lambs", "Se7en", "The Usual Suspects", 
  "Jurassic Park"
)

movie_data <- data.frame(
  movieID = movie_ids,
  movieName = movie_names
)

sql_file_path <- "movies.sql"
conn <- file(sql_file_path, "w")

create_table_statement <- "
CREATE TABLE Movies (
  movieID INT PRIMARY KEY,
  movieName VARCHAR(255)
);
"
writeLines(create_table_statement, conn)

for (i in 1:nrow(movie_data)) {
  movie_name <- gsub("'", "''", movie_data$movieName[i])
  
  insert_query <- paste0(
    "INSERT INTO Movies (movieID, movieName) VALUES (",
    movie_data$movieID[i], ", '", movie_name, "');"
  )
  
  writeLines(insert_query, conn)
}

close(conn)

Generating Names

The following code snip will generate names.sql.

user_ids <- 1:20
user_names <- c(
  "Alice", "Bob", "Charlie", "David", "Eve", 
  "Frank", "Grace", "Hannah", "Ivy", "Jack", 
  "Karen", "Leo", "Mona", "Nathan", "Olivia", 
  "Paul", "Quincy", "Rachel", "Sam", "Tina"
)

user_data <- data.frame(
  userID = user_ids,
  userName = user_names
)

sql_file_path <- "names.sql"
conn <- file(sql_file_path, "w")

create_table_statement <- "
CREATE TABLE Users (
  userID INT PRIMARY KEY,
  userName VARCHAR(255)
);
"
writeLines(create_table_statement, conn)

for (i in 1:nrow(user_data)) {
  user_name <- gsub("'", "''", user_data$userName[i])
  
  insert_query <- paste0(
    "INSERT INTO Users (userID, userName) VALUES (",
    user_data$userID[i], ", '", user_name, "');"
  )
  
  writeLines(insert_query, conn)
}

close(conn)

Gathering Ratings

The following code snip will generate `ratings.sql``.

user_ids <- 1:20
movie_ids <- 1:20
set.seed(123)
ratings <- sample(1:5, 400, replace = TRUE)

rating_data <- expand.grid(userID = user_ids, movieID = movie_ids)
rating_data$rating <- ratings

sql_file_path <- "ratings.sql"
conn <- file(sql_file_path, "w")

create_table_statement <- "
CREATE TABLE Ratings (
  userID INT,
  movieID INT,
  rating INT,
  PRIMARY KEY (userID, movieID)
);
"
writeLines(create_table_statement, conn)

for (i in 1:nrow(rating_data)) {
  insert_query <- paste0(
    "INSERT INTO Ratings (userID, movieID, rating) VALUES (",
    rating_data$userID[i], ", ", 
    rating_data$movieID[i], ", ",
    rating_data$rating[i], ");"
  )
  
  writeLines(insert_query, conn)
}

close(conn)

Combining the Three SQL files in R

Next we will combine the three sql files in R to make it into one SQL file. If the table exists, then the table will be dropped.

names_sql <- readLines("names.sql")
movies_sql <- readLines("movies.sql")
ratings_sql <- readLines("ratings.sql")

drop_tables <- c(
  "DROP TABLE IF EXISTS Users;",
  "DROP TABLE IF EXISTS Movies;",
  "DROP TABLE IF EXISTS Ratings;"
)

combined_sql <- c(
  drop_tables,
  "", 
  names_sql,
  "",  
  movies_sql,
  "",  
  ratings_sql
)

combined_sql_file <- "movie_ratings.sql"
writeLines(combined_sql, combined_sql_file)

cat("Combined SQL file saved at:", combined_sql_file)

Accessing the MySQL Database

We will not access the Database after importing movie_ratings.sql in MySQL Workbench under the movie_ratings schema, and then preview the tables.

A config file was utilized instead of entering the password on the code chunk

Accessing Tables via MySQL in R

After accessing the database, creating a variable for each table to view it.

reviewers <- dbSendQuery(sdb, "SELECT * FROM users;")
head(dbFetch(reviewers))

##   userID userName
## 1      1    Alice
## 2      2      Bob
## 3      3  Charlie
## 4      4    David
## 5      5      Eve
## 6      6    Frank

movie <- dbSendQuery(sdb, "SELECT * FROM movies;")
head(dbFetch(movie))

##   movieID                movieName
## 1       1 The Shawshank Redemption
## 2       2            The Godfather
## 3       3          The Dark Knight
## 4       4             Pulp Fiction
## 5       5         Schindler's List
## 6       6             Forrest Gump

mratings <- dbSendQuery(sdb, "SELECT * FROM ratings;")
head(dbFetch(mratings))

##   userID movieID rating
## 1      1       1      3
## 2      1       2      2
## 3      1       3      5
## 4      1       4      5
## 5      1       5      4
## 6      1       6      5

After verifying the variables work, then combining the data is necessary to generate movieRatings

reviewers <- dbFetch(dbSendQuery(sdb, "SELECT * FROM users;"))
movies <- dbFetch(dbSendQuery(sdb, "SELECT * FROM movies;"))
ratings <- dbFetch(dbSendQuery(sdb, "SELECT * FROM ratings;"))

ratings_reviewers <- left_join(ratings, reviewers, by = "userID")
combined_data <- left_join(ratings_reviewers, movies, by = "movieID")

head(combined_data)

##   userID movieID rating userName                movieName
## 1      1       1      3    Alice The Shawshank Redemption
## 2      1       2      2    Alice            The Godfather
## 3      1       3      5    Alice          The Dark Knight
## 4      1       4      5    Alice             Pulp Fiction
## 5      1       5      4    Alice         Schindler's List
## 6      1       6      5    Alice             Forrest Gump

Analyzing the Data

After combining the data and taking the average, Inception and The Empire Strikes back are significantly equal in rating of 3.30, whereas The Matrix had the lowest rating of 2.25. After checking the graph, the same results are shown as well.

adata <- combined_data %>%
  filter(!is.na(rating)) %>%
  group_by(movieName) %>%
    summarise(arating = mean(as.numeric(rating))) %>%
  arrange(desc(arating))
  head(adata)

## # A tibble: 6 × 2
##   movieName               arating
##   <chr>                     <dbl>
## 1 Inception                  3.3 
## 2 The Empire Strikes Back    3.3 
## 3 The Usual Suspects         3.25
## 4 Schindler's List           3.2 
## 5 Forrest Gump               3.15
## 6 Gladiator                  3.15

adata <- combined_data %>%
  filter(!is.na(rating)) %>%
  group_by(movieName) %>%
    summarise(arating = mean(as.numeric(rating))) %>%
  arrange(arating)
  head(adata)

## # A tibble: 6 × 2
##   movieName                                     arating
##   <chr>                                           <dbl>
## 1 The Matrix                                       2.25
## 2 The Lord of the Rings: The Return of the King    2.7 
## 3 Interstellar                                     2.75
## 4 Jurassic Park                                    2.75
## 5 The Dark Knight                                  2.8 
## 6 The Shawshank Redemption                         2.8

adata %>%
  ggplot +
  geom_col(aes(arating,movieName))

Another approach would be to check which movie had the most rating counts rather than taking the average of the ratings for each movie:

cdata <- combined_data

cdata %>% group_by(movieName, rating) %>% summarise(count = n()) %>%
  arrange(desc(count))

## `summarise()` has grouped output by 'movieName'. You can override using the
## `.groups` argument.

## # A tibble: 98 × 3
## # Groups:   movieName [20]
##    movieName                rating count
##    <chr>                     <int> <int>
##  1 The Empire Strikes Back       5     9
##  2 The Matrix                    1     8
##  3 Fight Club                    5     7
##  4 Forrest Gump                  4     7
##  5 Inception                     5     7
##  6 Interstellar                  2     7
##  7 Se7en                         3     7
##  8 The Shawshank Redemption      3     7
##  9 Back to the Future            2     6
## 10 Fight Club                    2     6
## # ℹ 88 more rows

From the findings here The Matrix has about 8 ratings of 1, whereas The Empire Strikes Back has a rating of 5 about 9 times.