Data

con <- dbConnect(drv = RSQLite::SQLite(),
                 dbname = ":memory:")

customer <- read.csv("/Users/sophiekemprecos/Desktop/3280/HW 09/HW 9 Data/customer.csv")
rental <- read.csv("/Users/sophiekemprecos/Desktop/3280/HW 09/HW 9 Data/rental.csv")
film <- read.csv("/Users/sophiekemprecos/Desktop/3280/HW 09/HW 9 Data/film.csv")
inventory <- read.csv("/Users/sophiekemprecos/Desktop/3280/HW 09/HW 9 Data/inventory.csv")
category <- read.csv("/Users/sophiekemprecos/Desktop/3280/HW 09/HW 9 Data/category.csv")
film_category <- read.csv("/Users/sophiekemprecos/Desktop/3280/HW 09/HW 9 Data/film_category.csv")
actor <- read.csv("/Users/sophiekemprecos/Desktop/3280/HW 09/HW 9 Data/actor.csv")
film_actor <- read.csv("/Users/sophiekemprecos/Desktop/3280/HW 09/HW 9 Data/film_actor.csv")
payment <- read.csv("/Users/sophiekemprecos/Desktop/3280/HW 09/HW 9 Data/payment.csv")

dbWriteTable(conn = con, name = "customer", value = customer)
dbWriteTable(conn = con, name = "rental", value = rental)
dbWriteTable(conn = con, name = "film", value = film)
dbWriteTable(conn = con, name = "inventory", value = inventory)
dbWriteTable(conn = con, name = "category", value = category)
dbWriteTable(conn = con, name = "film_category", value = film_category)
dbWriteTable(conn = con, name = "actor", value = actor)
dbWriteTable(conn = con, name = "film_actor", value = film_actor)
dbWriteTable(conn = con, name = "payment", value = payment)
knitr::opts_chunk$set(connection = "con")

dbListTables(con)
## [1] "actor"         "category"      "customer"      "film"         
## [5] "film_actor"    "film_category" "inventory"     "payment"      
## [9] "rental"

SQL

  1. List the top 10 customers who have rented the most films, including their first and last names, email addresses, and the total number of films they have rented.
SELECT
customer.first_name,
customer.last_name,
customer.email,
COUNT(rental.rental_id) AS total_rentals
FROM customer JOIN rental ON customer.customer_id = rental.customer_id
GROUP BY
customer.customer_id
ORDER BY
total_rentals DESC
LIMIT 10;
Displaying records 1 - 10
first_name last_name email total_rentals
ELEANOR HUNT 46
KARL SEAL 45
MARCIA DEAN 42
CLARA SHAW 42
TAMMY SANDERS 41
WESLEY BULL 40
SUE PETERS 40
TIM CARY 39
MARION SNYDER 39
RHONDA KENNEDY 39
  1. Find the titles and descriptions of films that have never been rented, sorted alphabetically by title.
SELECT
title, description
FROM film
LEFT JOIN inventory
ON film.film_id = inventory.film_id
LEFT JOIN rental
ON inventory.inventory_id = rental.inventory_id
WHERE rental.rental_id IS NULL
ORDER BY
film.title;
Displaying records 1 - 10
title description
ACADEMY DINOSAUR A Epic Drama of a Feminist And a Mad Scientist who must Battle a Teacher in The Canadian Rockies
ALICE FANTASIA A Emotional Drama of a A Shark And a Database Administrator who must Vanquish a Pioneer in Soviet Georgia
APOLLO TEEN A Action-Packed Reflection of a Crocodile And a Explorer who must Find a Sumo Wrestler in An Abandoned Mine Shaft
ARGONAUTS TOWN A Emotional Epistle of a Forensic Psychologist And a Butler who must Challenge a Waitress in An Abandoned Mine Shaft
ARK RIDGEMONT A Beautiful Yarn of a Pioneer And a Monkey who must Pursue a Explorer in The Sahara Desert
ARSENIC INDEPENDENCE A Fanciful Documentary of a Mad Cow And a Womanizer who must Find a Dentist in Berlin
BOONDOCK BALLROOM A Fateful Panorama of a Crocodile And a Boy who must Defeat a Monkey in The Gulf of Mexico
BUTCH PANTHER A Lacklusture Yarn of a Feminist And a Database Administrator who must Face a Hunter in New Orleans
CATCH AMISTAD A Boring Reflection of a Lumberjack And a Feminist who must Discover a Woman in Nigeria
CHINATOWN GLADIATOR A Brilliant Panorama of a Technical Writer And a Lumberjack who must Escape a Butler in Ancient India
  1. Retrieve a list of film categories and the average length of films in each category, ordered by the average length in descending order. Include the category name and the average length
SELECT
category.name,
AVG(film.length) AS avg_length
FROM film
JOIN film_category
ON film.film_id = film_category.film_id
JOIN category
ON film_category.category_id = category.category_id
GROUP BY category.name
ORDER BY
avg_length DESC;
Displaying records 1 - 10
name avg_length
Sports 128.2027
Games 127.8361
Foreign 121.6986
Drama 120.8387
Comedy 115.8276
Family 114.7826
Music 113.6471
Travel 113.3158
Horror 112.4821
Classics 111.6667
  1. Identify the 5 actors who have appeared in the most films, along with the number of films they’ve acted in. Include their first and last names.
SELECT
actor.first_name, actor.last_name,
COUNT(film_actor.film_id) AS film_count
FROM actor
JOIN film_actor
ON actor.actor_id = film_actor.actor_id
GROUP BY actor.actor_id
ORDER BY film_count DESC
LIMIT 5;
5 records
first_name last_name film_count
GINA DEGENERES 42
WALTER TORN 41
MARY KEITEL 40
MATTHEW CARREY 39
SANDRA KILMER 37
  1. List the names of customers who have rented at least one film starring the actor ’Johnny Depp.’
SELECT customer.first_name, customer.last_name
FROM customer
JOIN rental
ON customer.customer_id = rental.customer_id
JOIN inventory
ON rental.inventory_id = inventory.inventory_id
JOIN film
ON inventory.film_id = film.film_id
JOIN film_actor
ON film.film_id = film_actor.film_id
JOIN actor
ON film_actor.actor_id = actor.actor_id
WHERE actor.first_name = 'Johnny'
AND actor.last_name = 'Depp'
ORDER BY customer.last_name, customer.first_name;
0 records
first_name last_name
  1. Find the top 10 earning film titles along with their total revenue (rental fees), sorted by revenue in descending order. Include the film title and total revenue.**
SELECT film.title,
SUM(payment.amount) AS total_revenue
FROM film
JOIN inventory
ON film.film_id = inventory.film_id
JOIN rental
ON inventory.inventory_id = rental.inventory_id
JOIN payment
ON rental.rental_id = payment.rental_id
GROUP BY film.title
ORDER BY total_revenue DESC
LIMIT 10;
Displaying records 1 - 10
title total_revenue
TELEGRAPH VOYAGE 231.73
WIFE TURN 223.69
ZORRO ARK 214.69
GOODFELLAS SALUTE 209.69
SATURDAY LAMBS 204.72
TITANS JERK 201.71
TORQUE BOUND 198.72
HARRY IDAHO 195.70
INNOCENT USUAL 191.74
HUSTLER PARTY 190.78

Plotly

  1. Use the diamonds dataset to create a 2D plotly scatterplot (you can choose the variables). The plot should have a title and subtitle, axis labels, and hover information that is useful.

    The following plotly shows the relationship between carat level and price. We see this mapped as a scatterplot in blue showing a positive relationship between the two.

    scat_plot <- plot_ly(data = diamonds, x = ~carat, y = ~price, marker = list(color = 'navyblue'))
    scat_plot <- scat_plot %>% 
      layout(title = list(text = paste0("Relationship Between Carat and Price",
            "<br><sup>Higher-carat diamonds tend to cost more</sup>")), 
             xaxis = list(title = 'Carat'), yaxis = list(title = 'Price')) 
    scat_plot
    ## No trace type specified:
    ##   Based on info supplied, a 'scatter' trace seems appropriate.
    ##   Read more about this trace type -> https://plotly.com/r/reference/#scatter
    ## No scatter mode specifed:
    ##   Setting the mode to markers
    ##   Read more about this attribute -> https://plotly.com/r/reference/#scatter-mode
  2. Compare and contrast ggplot2 and plotly. What is an example of a time when you would definitely want to use each one over the other?

GGplot2 and Plotly have very similar functions. Both are sophisticated and easy to use software for creating descriptive plots in different formats. The above scatterplot using Plotly could be made very similarly using ggplot2 for a very close outcome. One major difference is that when there is a large amount of data being represented, Plotly allows you to see descriptive information for individual data points using the hover mechanism. For example, if you wanted to see the values of an outlier or a values in a mix of data points Plotly allows you to see specific values. When doing EDA the interactivitey is a very useful feature, however it does not knit into the pdf with the interactive features. This is one example where Plotly does not carry its advantages in some formats, so that is an important consideration for when to use ggplot or plotly. A good time to use plotly would also be when creating a dashboard. This is similar to Tableau, but you can make accessible dashboards or animations of plotly that can be shared.

  1. Using the gapminder dataset, make a Plotly plot which compares two variables over time. Explain the meaning of this comparison in context and describe what narrative is advanced by this comparison.

    The graph shows that as GDP per capita increases over time, life expectancy also tends to rise. These are not parallel trends but we do see significant changes over time. This is because better economic growth reflected by gdp per capita will also improve health outcomes and longevity of people. The narrative this advances is that economic development and people’s health are closely linked, where as a country becomes richer, people have better healthcare and can live longer. At the same time, the plot also suggests that gains in life expectancy may slow down at higher income levels, which may be due to diminishing returns from more income.

    gap_data <- gapminder %>% 
      filter(country == "Indonesia")
    
    gap_plot <- plot_ly()
    gap_plot <- gap_plot %>%
      add_trace(data = gap_data, x = ~year, y = ~lifeExp, name = "Life Expectancy", line = list(color = "pink")) %>%
      add_trace(data = gap_data, x = ~year, y = ~gdpPercap/1000, name = "GDP Per Capita (thousands)", line = list(color = "purple")) %>%
      layout(title = "Life Expectancy and GDP per Capita in Indonesia",
        xaxis = list(title = "Year"),
        yaxis = list(title = "Life Expectancy / GDP per Capita in thousands")) %>% style(visible = "legendonly", traces = 0)
    ## No trace type specified:
    ##   Based on info supplied, a 'scatter' trace seems appropriate.
    ##   Read more about this trace type -> https://plotly.com/r/reference/#scatter
    ## No scatter mode specifed:
    ##   Setting the mode to markers
    ##   Read more about this attribute -> https://plotly.com/r/reference/#scatter-mode
    ## No trace type specified:
    ##   Based on info supplied, a 'scatter' trace seems appropriate.
    ##   Read more about this trace type -> https://plotly.com/r/reference/#scatter
    ## No scatter mode specifed:
    ##   Setting the mode to markers
    ##   Read more about this attribute -> https://plotly.com/r/reference/#scatter-mode
    ## A line object has been specified, but lines is not in the mode
    ## Adding lines to the mode...
    ## A line object has been specified, but lines is not in the mode
    ## Adding lines to the mode...
    gap_plot