This document can serve as a reasonable start to your work on HW 8.
Note that you need to have all nine csv files in the same folder as this
HTML file. If you connected to the Sakila database remotely (the
intended way before connection issues), that’s fine.
Method 2: Create the database locally.
library(DBI)
library(dbplyr)
con <- dbConnect(drv = RSQLite::SQLite(),
dbname = ":memory:")
customer <- read.csv("customer.csv")
rental <- read.csv("rental.csv")
film <- read.csv("film.csv")
inventory <- read.csv("inventory.csv")
category <- read.csv("category.csv")
film_category <- read.csv("film_category.csv")
actor <- read.csv("actor.csv")
film_actor <- read.csv("film_actor.csv")
payment <- read.csv("payment.csv")
dbWriteTable(conn = con, name = "customer", value = customer)
dbWriteTable(conn = con, name = "rental", value = rental)
dbWriteTable(conn = con, name = "film", value = film)
dbWriteTable(conn = con, name = "inventory", value = inventory)
dbWriteTable(conn = con, name = "category", value = category)
dbWriteTable(conn = con, name = "film_category", value = film_category)
dbWriteTable(conn = con, name = "actor", value = actor)
dbWriteTable(conn = con, name = "film_actor", value = film_actor)
dbWriteTable(conn = con, name = "payment", value = payment)
knitr::opts_chunk$set(connection = "con")
dbListTables(con)
## [1] "actor" "category" "customer" "film"
## [5] "film_actor" "film_category" "inventory" "payment"
## [9] "rental"
Test Query
SELECT *
FROM actor
Displaying records 1 - 10
| 1 |
PENELOPE |
GUINESS |
2006-02-15 04:34:33 |
| 2 |
NICK |
WAHLBERG |
2006-02-15 04:34:33 |
| 3 |
ED |
CHASE |
2006-02-15 04:34:33 |
| 4 |
JENNIFER |
DAVIS |
2006-02-15 04:34:33 |
| 5 |
JOHNNY |
LOLLOBRIGIDA |
2006-02-15 04:34:33 |
| 6 |
BETTE |
NICHOLSON |
2006-02-15 04:34:33 |
| 7 |
GRACE |
MOSTEL |
2006-02-15 04:34:33 |
| 8 |
MATTHEW |
JOHANSSON |
2006-02-15 04:34:33 |
| 9 |
JOE |
SWANK |
2006-02-15 04:34:33 |
| 10 |
CHRISTIAN |
GABLE |
2006-02-15 04:34:33 |
SQL
Question 1
SELECT customer.first_name, customer.last_name, customer.email, COUNT(rental.rental_id) AS total_rentals
FROM customer
JOIN rental
ON customer.customer_id = rental.customer_id
GROUP BY customer.first_name, customer.last_name, customer.email
ORDER BY total_rentals DESC
LIMIT 10;
Question 2
SELECT film.title, film.description
FROM film
LEFT JOIN inventory
ON film.film_id = inventory.film_id
LEFT JOIN rental
ON inventory.inventory_id = rental.inventory_id
WHERE rental.rental_id IS NULL
GROUP BY film.title, film.description
ORDER BY film.title ASC;
Displaying records 1 - 10
| ACADEMY DINOSAUR |
A Epic Drama of a Feminist And a Mad Scientist who must
Battle a Teacher in The Canadian Rockies |
| ALICE FANTASIA |
A Emotional Drama of a A Shark And a Database
Administrator who must Vanquish a Pioneer in Soviet Georgia |
| APOLLO TEEN |
A Action-Packed Reflection of a Crocodile And a
Explorer who must Find a Sumo Wrestler in An Abandoned Mine Shaft |
| ARGONAUTS TOWN |
A Emotional Epistle of a Forensic Psychologist And a
Butler who must Challenge a Waitress in An Abandoned Mine Shaft |
| ARK RIDGEMONT |
A Beautiful Yarn of a Pioneer And a Monkey who must
Pursue a Explorer in The Sahara Desert |
| ARSENIC INDEPENDENCE |
A Fanciful Documentary of a Mad Cow And a Womanizer who
must Find a Dentist in Berlin |
| BOONDOCK BALLROOM |
A Fateful Panorama of a Crocodile And a Boy who must
Defeat a Monkey in The Gulf of Mexico |
| BUTCH PANTHER |
A Lacklusture Yarn of a Feminist And a Database
Administrator who must Face a Hunter in New Orleans |
| CATCH AMISTAD |
A Boring Reflection of a Lumberjack And a Feminist who
must Discover a Woman in Nigeria |
| CHINATOWN GLADIATOR |
A Brilliant Panorama of a Technical Writer And a
Lumberjack who must Escape a Butler in Ancient India |
Question 3
SELECT category.name AS category, AVG(film.length) AS avg_length
FROM film
JOIN film_category ON film.film_id = film_category.film_id
JOIN category ON film_category.category_id = category.category_id
GROUP BY category.category_id
ORDER BY avg_length DESC;
Displaying records 1 - 10
| Sports |
128.2027 |
| Games |
127.8361 |
| Foreign |
121.6986 |
| Drama |
120.8387 |
| Comedy |
115.8276 |
| Family |
114.7826 |
| Music |
113.6471 |
| Travel |
113.3158 |
| Horror |
112.4821 |
| Classics |
111.6667 |
Question 4
SELECT actor.first_name, actor.last_name, COUNT(film_actor.film_id) AS number_films
FROM actor
JOIN film_actor
ON actor.actor_id = film_actor.actor_id
GROUP BY actor.first_name, actor.last_name
ORDER BY number_films DESC
LIMIT 5;
5 records
| SUSAN |
DAVIS |
54 |
| GINA |
DEGENERES |
42 |
| WALTER |
TORN |
41 |
| MARY |
KEITEL |
40 |
| MATTHEW |
CARREY |
39 |
Question 5
SELECT customer.first_name, customer.last_name
FROM customer
JOIN rental ON customer.customer_id = rental.customer_id
JOIN inventory ON rental.inventory_id = inventory.inventory_id
JOIN film_actor ON inventory.film_id = film_actor.film_id
JOIN actor ON film_actor.actor_id = actor.actor_id
WHERE actor.first_name = 'JOHNNY' AND actor.last_name = 'DEPP'
GROUP BY customer.first_name, customer.last_name;
Question 6
SELECT film.title, SUM(payment.amount) AS revenue
FROM payment
JOIN rental ON payment.rental_id = rental.rental_id
JOIN inventory ON rental.inventory_id = inventory.inventory_id
JOIN film ON inventory.film_id = film.film_id
GROUP BY film.title
ORDER BY revenue DESC
LIMIT 10;
Displaying records 1 - 10
| TELEGRAPH VOYAGE |
231.73 |
| WIFE TURN |
223.69 |
| ZORRO ARK |
214.69 |
| GOODFELLAS SALUTE |
209.69 |
| SATURDAY LAMBS |
204.72 |
| TITANS JERK |
201.71 |
| TORQUE BOUND |
198.72 |
| HARRY IDAHO |
195.70 |
| INNOCENT USUAL |
191.74 |
| HUSTLER PARTY |
190.78 |
Plotly
# load packages
library(plotly)
## Loading required package: ggplot2
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
Question 1
diamond_plot <- plot_ly(
data = diamonds[sample(nrow(diamonds), 750), ],
x = ~carat,
y = ~price,
type = "scatter",
mode = "markers",
text = ~paste("Cut:", cut, "<br>Price: $", price),
hoverinfo = "text"
)
diamond_plot <- diamond_plot %>%
layout(
title = list(text = "Diamond Prices by Carat Weight<br><sup>From sample of 750 points</sup>"),
xaxis = list(title = "Carat Weight"),
yaxis = list(title = "Price (USD)")
)
diamond_plot
Question 2
ggplot2 provides static visualizations while plotly provides
interative visualizations. You would want to use ggplot2 if you are
making a plot that you will publish on paper, such as a paper. Plotly is
more fit for online distribution purposes, such as websites and online
dashboards. If you’re working with a large dataset, you would want to
use ggplot2 because plotly can be quite laggy with bigger datasets. When
I was working with the diamonds dataset for Question 1, I ended up
having to take a sample of the data because plotly could not handle the
sheer amount of data that was in the dataset.
Question 3
library(gapminder)
us_data <- gapminder %>% filter(country == "United States")
plot_ly(
data = us_data,
x = ~gdpPercap,
y = ~lifeExp,
type = 'scatter',
mode = 'lines+markers',
text = ~paste("Year:", year,
"<br>GDP per Capita: $", round(gdpPercap, 2),
"<br>Life Expectancy (yrs):", lifeExp),
marker = list(color = 'black', sizes = 20)
) %>%
layout(
xaxis = list(title = "GDP per Capita (USD)"),
yaxis = list(title = "Life Expectancy (Years)"),
title = list(text = "GDP per Capita vs Life Expectancy Over Time (United States)<br><sup>1952 - 2007</sup>")
)
This Plotly plot shows that over time, as life expectancy in the US
increased, so did GDP per capita (in USD). GDP and life expectance are
both seen as an indicator of the standard of living for a country. The
results of the plot are not surprising because as the overall wealth of
a country increases—demonstrated by the steady increase in GDP—the
standard of living is also increasing, which follows the pattern of the
increase in life expectancy.