This document can serve as a reasonable start to your work on HW 8. Note that you need to have all nine csv files in the same folder as this HTML file. If you connected to the Sakila database remotely (the intended way before connection issues), that’s fine.

Method 2: Create the database locally.

library(DBI)
library(dbplyr)
con <- dbConnect(drv = RSQLite::SQLite(),
                 dbname = ":memory:")

customer <- read.csv("customer.csv")
rental <- read.csv("rental.csv")
film <- read.csv("film.csv")
inventory <- read.csv("inventory.csv")
category <- read.csv("category.csv")
film_category <- read.csv("film_category.csv")
actor <- read.csv("actor.csv")
film_actor <- read.csv("film_actor.csv")
payment <- read.csv("payment.csv")

dbWriteTable(conn = con, name = "customer", value = customer)
dbWriteTable(conn = con, name = "rental", value = rental)
dbWriteTable(conn = con, name = "film", value = film)
dbWriteTable(conn = con, name = "inventory", value = inventory)
dbWriteTable(conn = con, name = "category", value = category)
dbWriteTable(conn = con, name = "film_category", value = film_category)
dbWriteTable(conn = con, name = "actor", value = actor)
dbWriteTable(conn = con, name = "film_actor", value = film_actor)
dbWriteTable(conn = con, name = "payment", value = payment)
knitr::opts_chunk$set(connection = "con")

dbListTables(con)

## [1] "actor"         "category"      "customer"      "film"         
## [5] "film_actor"    "film_category" "inventory"     "payment"      
## [9] "rental"

Question 1)

SELECT 
    first_name, 
    last_name, 
    email, 
    COUNT(*) AS total_rentals
FROM 
    customer
JOIN 
    rental ON customer.customer_id = rental.customer_id
GROUP BY 
    first_name, last_name, email
ORDER BY 
    total_rentals DESC
LIMIT 10;

Displaying records 1 - 10
first_name	last_name	email	total_rentals
ELEANOR	HUNT	ELEANOR.HUNT@sakilacustomer.org	46
KARL	SEAL	KARL.SEAL@sakilacustomer.org	45
CLARA	SHAW	CLARA.SHAW@sakilacustomer.org	42
MARCIA	DEAN	MARCIA.DEAN@sakilacustomer.org	42
TAMMY	SANDERS	TAMMY.SANDERS@sakilacustomer.org	41
SUE	PETERS	SUE.PETERS@sakilacustomer.org	40
WESLEY	BULL	WESLEY.BULL@sakilacustomer.org	40
MARION	SNYDER	MARION.SNYDER@sakilacustomer.org	39
RHONDA	KENNEDY	RHONDA.KENNEDY@sakilacustomer.org	39
TIM	CARY	TIM.CARY@sakilacustomer.org	39

Question 2)

SELECT 
    title, 
    description
FROM 
    film
LEFT JOIN 
    inventory ON film.film_id = inventory.film_id
LEFT JOIN 
    rental ON inventory.inventory_id = rental.inventory_id
WHERE 
    rental.rental_id IS NULL
ORDER BY 
    title;

Displaying records 1 - 10
title	description
ACADEMY DINOSAUR	A Epic Drama of a Feminist And a Mad Scientist who must Battle a Teacher in The Canadian Rockies
ALICE FANTASIA	A Emotional Drama of a A Shark And a Database Administrator who must Vanquish a Pioneer in Soviet Georgia
APOLLO TEEN	A Action-Packed Reflection of a Crocodile And a Explorer who must Find a Sumo Wrestler in An Abandoned Mine Shaft
ARGONAUTS TOWN	A Emotional Epistle of a Forensic Psychologist And a Butler who must Challenge a Waitress in An Abandoned Mine Shaft
ARK RIDGEMONT	A Beautiful Yarn of a Pioneer And a Monkey who must Pursue a Explorer in The Sahara Desert
ARSENIC INDEPENDENCE	A Fanciful Documentary of a Mad Cow And a Womanizer who must Find a Dentist in Berlin
BOONDOCK BALLROOM	A Fateful Panorama of a Crocodile And a Boy who must Defeat a Monkey in The Gulf of Mexico
BUTCH PANTHER	A Lacklusture Yarn of a Feminist And a Database Administrator who must Face a Hunter in New Orleans
CATCH AMISTAD	A Boring Reflection of a Lumberjack And a Feminist who must Discover a Woman in Nigeria
CHINATOWN GLADIATOR	A Brilliant Panorama of a Technical Writer And a Lumberjack who must Escape a Butler in Ancient India

Question 3)

SELECT 
    category.name AS category_name, 
    AVG(film.length) AS average_length
FROM 
    film
JOIN 
    film_category ON film.film_id = film_category.film_id
JOIN 
    category ON film_category.category_id = category.category_id
GROUP BY 
    category.name
ORDER BY 
    average_length DESC;

Displaying records 1 - 10
category_name	average_length
Sports	128.2027
Games	127.8361
Foreign	121.6986
Drama	120.8387
Comedy	115.8276
Family	114.7826
Music	113.6471
Travel	113.3158
Horror	112.4821
Classics	111.6667

#Question 4)

SELECT CONCAT(first_name, ' ', last_name) AS full_name,
  COUNT(film.film_id) AS number_films
FROM 
    actor, film_actor, film
WHERE 
    actor.actor_id = film_actor.actor_id
AND 
    film_actor.film_id = film.film_id
GROUP BY 
    actor.actor_id, actor.first_name, actor.last_name
ORDER BY 
    count(film.film_id) DESC
LIMIT 5;

5 records
full_name	number_films
GINA DEGENERES	42
WALTER TORN	41
MARY KEITEL	40
MATTHEW CARREY	39
SANDRA KILMER	37

#Question 5)

SELECT 
    DISTINCT customer.first_name, 
    customer.last_name
FROM 
    customer
JOIN 
    inventory ON customer.store_id = inventory.store_id
JOIN 
    film_actor ON inventory.film_id = film_actor.film_id
JOIN 
    actor ON film_actor.actor_id = actor.actor_id
WHERE 
    actor.first_name = 'JOHNNY' AND actor.last_name = 'DEPP';

0 records
first_name	last_name

Question 6)

SELECT 
    film.title, 
    SUM(payment.amount) AS total_revenue
FROM 
    film
JOIN 
    inventory ON film.film_id = inventory.film_id
JOIN 
    rental ON inventory.inventory_id = rental.inventory_id
JOIN 
    payment ON rental.rental_id = payment.rental_id
GROUP BY 
    film.title
ORDER BY 
    total_revenue DESC
LIMIT 10;

Displaying records 1 - 10
title	total_revenue
TELEGRAPH VOYAGE	231.73
WIFE TURN	223.69
ZORRO ARK	214.69
GOODFELLAS SALUTE	209.69
SATURDAY LAMBS	204.72
TITANS JERK	201.71
TORQUE BOUND	198.72
HARRY IDAHO	195.70
INNOCENT USUAL	191.74
HUSTLER PARTY	190.78

Plotly

Question 1)

library(plotly)

## Loading required package: ggplot2

## 
## Attaching package: 'plotly'

## The following object is masked from 'package:ggplot2':
## 
##     last_plot

## The following object is masked from 'package:stats':
## 
##     filter

## The following object is masked from 'package:graphics':
## 
##     layout

plot1 <- plot_ly(diamonds, x = ~carat, y = ~price, color = ~clarity,
        text = ~cut,
        hoverinfo = "text", type = 'scatter', mode = "markers") %>%
  layout(
    title = list(
      text = "Weight of the Diamond (Carat) vs. Price<br><sup>Subtitle: Diamond prices tend to increase when the weight of the diamond increases.</sup>",  
      x = 0.5,  # Center the title horizontally
      y = 0.95,  # Adjust the vertical position of the title
      font = list(size = 15),  # Set the font size
      align = "center"  # Align the title in the center
    ),
    xaxis = list(title = "Number of Carats"),
    yaxis = list(title = "Price (USD)")
  )
plot1

Question 2)

Both ggplot2 and plotly are effective libraries to create data visualizations, especially those that involve layered plot components. They are also both very customizable and can be used for a variety of different plots. However, they also have varying abilities which differentiate them. Firstly, ggplot is more frequently used with static plots, and requires additional tools to incorporate interactivity. In contast, plotly allows users to create dynamic and interactive plots. Next, ggplot2 is more focused in customization while, some argue, that plotly places a greater emphasis on interactivity rather than extensive customization. In terms of performance, ggplot2 performs well with a large number of data points, but it can be slower with extremely large datasets because it is static. Alternatively, plotly can handle large datasets, but complex animations and interactivity may cause performance issues if the dataset is too large. Finally, ggplot2 produces static outputs and is ideal for print publications or high quality image files. It can be exported as PNG, PDF, or SVG. Conversely, plotly’s outputs are interactive by default, but can be exported to static images. They are best for web applications, dashboards, and interactive presentations. These outputs are easily integrated into other web technologies.

If I were submitting a written, printed assignment, I would want to use ggplot2 visualizations because they are better suited for print. Alternatively, if I were creating a website and wanted users to better understand the plots contents and explore its abilities, I would opt for plotly graphics.

Question 3)

library(gapminder)
plot2 <- plot_ly(gapminder, x = ~pop, y = ~lifeExp, color = ~continent,
        text = ~country,
        hoverinfo = "text", type = 'scatter', mode = "markers", frame = ~year) %>%
  layout(
    title = list(
      text = "Population vs. Life Expectancy",  
      x = 0.5,  
      y = 0.95, 
      font = list(size = 20), 
      align = "center"
    ),
    xaxis = list(title = "Population"),
    yaxis = list(title = "Life Expectancy")
  )

plot2 <- plot2 %>% layout(
   updatemenus = list(
     list(
       type = "buttons",
       showactive = FALSE,
       x = 1,  
       y = 0,  
       buttons = list(
         
         list(
           label = "Pause",
           method = "animate",
          args = list(NULL, list(frame = list(duration = 0, 
                                               redraw = TRUE), 
                                  mode = "immediate"))
         )
       )
     )
   )
 )
plot2

In context, this dynamic scatterplot shows how various countries’ populations and life expectancies vary over time. More specifically, with the option to play this visualization, the viewer can see that the majority of the countries experience relatively small increases in their population overtime, and the life expectancy for all countries increases. China and India can be seen to deviate from this trend, as they move upward and dramatically to the right. This highlights a significant population boom and an increase in life expectancy. Another trend that is made visible by this visualization is the way in which the various regions tend to move in clusters. For example, the countries of Africa remain at the lower range of life expectancy ages and the European countries tend to be on the higher end of that same range. This graph illustrates the narrative that, generally speaking, life expectancy has risen amongst countries and population has changed less dramatically. Moreover, despite this improvement, the countries of Africa continue to lag behind other regions / countries.

HW 08

Method 2: Create the database locally.

Question 1)

Question 2)

Question 3)

Question 6)

Plotly

Question 1)

Question 2)

If I were submitting a written, printed assignment, I would want to use ggplot2 visualizations because they are better suited for print. Alternatively, if I were creating a website and wanted users to better understand the plots contents and explore its abilities, I would opt for plotly graphics.

Question 3)