This document can serve as a reasonable start to your work on HW 8.
Note that you need to have all nine csv files in the same folder as this
HTML file. If you connected to the Sakila database remotely (the
intended way before connection issues), that’s fine.
Method 2: Create the database locally.
library(DBI)
library(dbplyr)
con <- dbConnect(drv = RSQLite::SQLite(),
dbname = ":memory:")
setwd("/Users/alexsherman/Desktop/filesss")
customer <- read.csv("customer.csv")
rental <- read.csv("rental.csv")
film <- read.csv("film.csv")
inventory <- read.csv("inventory.csv")
category <- read.csv("category.csv")
film_category <- read.csv("film_category.csv")
actor <- read.csv("actor.csv")
film_actor <- read.csv("film_actor.csv")
payment <- read.csv("payment.csv")
dbWriteTable(conn = con, name = "customer", value = customer)
dbWriteTable(conn = con, name = "rental", value = rental)
dbWriteTable(conn = con, name = "film", value = film)
dbWriteTable(conn = con, name = "inventory", value = inventory)
dbWriteTable(conn = con, name = "category", value = category)
dbWriteTable(conn = con, name = "film_category", value = film_category)
dbWriteTable(conn = con, name = "actor", value = actor)
dbWriteTable(conn = con, name = "film_actor", value = film_actor)
dbWriteTable(conn = con, name = "payment", value = payment)
knitr::opts_chunk$set(connection = "con")
dbListTables(con)
## [1] "actor" "category" "customer" "film"
## [5] "film_actor" "film_category" "inventory" "payment"
## [9] "rental"
Test Query
SELECT *
FROM actor
Displaying records 1 - 10
| 1 |
PENELOPE |
GUINESS |
2006-02-15 04:34:33 |
| 2 |
NICK |
WAHLBERG |
2006-02-15 04:34:33 |
| 3 |
ED |
CHASE |
2006-02-15 04:34:33 |
| 4 |
JENNIFER |
DAVIS |
2006-02-15 04:34:33 |
| 5 |
JOHNNY |
LOLLOBRIGIDA |
2006-02-15 04:34:33 |
| 6 |
BETTE |
NICHOLSON |
2006-02-15 04:34:33 |
| 7 |
GRACE |
MOSTEL |
2006-02-15 04:34:33 |
| 8 |
MATTHEW |
JOHANSSON |
2006-02-15 04:34:33 |
| 9 |
JOE |
SWANK |
2006-02-15 04:34:33 |
| 10 |
CHRISTIAN |
GABLE |
2006-02-15 04:34:33 |
1)
SELECT customer.first_name,customer.last_name, customer.email,
COUNT(rental.rental_id) AS total_rentals
FROM customer
JOIN rental ON customer.customer_id = rental.customer_id
GROUP BY customer.customer_id, customer.first_name,customer.last_name, customer.email
ORDER BY total_rentals DESC
LIMIT 10;
2)
SELECT film.title, film.description
FROM film
LEFT JOIN inventory ON film.film_id = inventory.film_id
LEFT JOIN rental ON inventory.inventory_id = rental.inventory_id
WHERE rental.rental_id IS NULL
ORDER BY film.title;
Displaying records 1 - 10
| ACADEMY DINOSAUR |
A Epic Drama of a Feminist And a Mad Scientist who must
Battle a Teacher in The Canadian Rockies |
| ALICE FANTASIA |
A Emotional Drama of a A Shark And a Database
Administrator who must Vanquish a Pioneer in Soviet Georgia |
| APOLLO TEEN |
A Action-Packed Reflection of a Crocodile And a
Explorer who must Find a Sumo Wrestler in An Abandoned Mine Shaft |
| ARGONAUTS TOWN |
A Emotional Epistle of a Forensic Psychologist And a
Butler who must Challenge a Waitress in An Abandoned Mine Shaft |
| ARK RIDGEMONT |
A Beautiful Yarn of a Pioneer And a Monkey who must
Pursue a Explorer in The Sahara Desert |
| ARSENIC INDEPENDENCE |
A Fanciful Documentary of a Mad Cow And a Womanizer who
must Find a Dentist in Berlin |
| BOONDOCK BALLROOM |
A Fateful Panorama of a Crocodile And a Boy who must
Defeat a Monkey in The Gulf of Mexico |
| BUTCH PANTHER |
A Lacklusture Yarn of a Feminist And a Database
Administrator who must Face a Hunter in New Orleans |
| CATCH AMISTAD |
A Boring Reflection of a Lumberjack And a Feminist who
must Discover a Woman in Nigeria |
| CHINATOWN GLADIATOR |
A Brilliant Panorama of a Technical Writer And a
Lumberjack who must Escape a Butler in Ancient India |
3)
SELECT category.name AS category_name, AVG(film.length) AS average_length
FROM film
JOIN film_category USING (film_id)
JOIN category USING (category_id)
GROUP BY category.name
ORDER BY average_length DESC;
Displaying records 1 - 10
| Sports |
128.2027 |
| Games |
127.8361 |
| Foreign |
121.6986 |
| Drama |
120.8387 |
| Comedy |
115.8276 |
| Family |
114.7826 |
| Music |
113.6471 |
| Travel |
113.3158 |
| Horror |
112.4821 |
| Classics |
111.6667 |
4)
SELECT actor.first_name, actor.last_name,
COUNT(film_actor.film_id) AS film_count
FROM actor
JOIN film_actor USING (actor_id)
GROUP BY actor.actor_id, actor.first_name, actor.last_name
ORDER BY film_count DESC
LIMIT 5;
5 records
| GINA |
DEGENERES |
42 |
| WALTER |
TORN |
41 |
| MARY |
KEITEL |
40 |
| MATTHEW |
CARREY |
39 |
| SANDRA |
KILMER |
37 |
5)
SELECT DISTINCT customer.first_name, customer.last_name
FROM customer
JOIN rental ON customer.customer_id = rental.customer_id
JOIN inventory ON rental.inventory_id = inventory.inventory_id
JOIN film ON inventory.film_id = film.film_id
JOIN film_actor ON film.film_id = film_actor.film_id
JOIN actor ON film_actor.actor_id = actor.actor_id
WHERE actor.first_name = 'JOHNNY' AND actor.last_name = 'DEPP';
6)
SELECT film.title,
SUM(payment.amount) AS total_revenue
FROM film
JOIN inventory ON film.film_id = inventory.film_id
JOIN rental ON inventory.inventory_id = rental.inventory_id
JOIN payment ON rental.rental_id = payment.rental_id
GROUP BY film.title
ORDER BY total_revenue DESC
LIMIT 10;
Displaying records 1 - 10
| TELEGRAPH VOYAGE |
231.73 |
| WIFE TURN |
223.69 |
| ZORRO ARK |
214.69 |
| GOODFELLAS SALUTE |
209.69 |
| SATURDAY LAMBS |
204.72 |
| TITANS JERK |
201.71 |
| TORQUE BOUND |
198.72 |
| HARRY IDAHO |
195.70 |
| INNOCENT USUAL |
191.74 |
| HUSTLER PARTY |
190.78 |
Plotly
1)
library(plotly)
## Loading required package: ggplot2
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library(ggplot2)
plotly1 = plot_ly(data = diamonds, x = ~carat,y = ~price, color = ~cut, type = 'scatter', mode = 'markers', marker = list(size = 10)) %>%
layout(title = list(text = "Diamond Price vs Carat | >sup>Based on Different Cuts</sup>",
font = list(size = 18),x = 0.5),xaxis = list(title = "Carat"), yaxis = list(title = "Price"))
plotly1
2)
Plotly and ggplot are both great ways to show data visually. They can
both present data very efficiently, however plotly can take it a step
further an add an interactive element which is a step beyond the static
visuals that ggplot produces. A situation in which I would definitely
want to use plotly is when considering a scatter plot where the points
represent individual observations such as a unique movie or product. For
example, in the homework last week, the shiny app that I looked at
plotted movies based on box office and rotten tomatoes score. When more
than 10 to 20 observations are present, adding the title of the plots on
either a legend or as a point label would create significant clutter.
Instead, they used plotly, so all you had to do was hover over a point
to determine what movie it was. Another situation would be a heatmap or
a geographic heat map where the viewer would want to see more
information on each state or region, and they can do so by hovering.
GGplot would be better in situations where you want to show a general
trend and have a significant amount of information. A good example would
be the graphic above. Carats vs price from the diamonds dataset. Here
the viewer is more concerned about about the trend of how carats affect
price rather than looking at individual diamonds. While plotly works, it
is slow given the amount of data, and the interactive element doesn’t
provide that much value. Everything you need to know (carat, price, and
cut) is present. The viewer can get specific information for a signle
diamond if they so desire.
3)
library(gapminder)
animated_plot = plot_ly(data = gapminder, x = ~gdpPercap, y = ~lifeExp,
color = ~continent,frame = ~year, type = 'scatter',
marker = list(opacity = 0.7),
text = ~paste("Country:", country, "<br>Year:", year,
"<br>GDP per Capita:", gdpPercap, "<br>Life Expectancy:", lifeExp)) %>%
layout(title = "GDP per Capita vs Life Expectancy (1952 to 2007)", xaxis = list(title = "GDP per Capita"),
yaxis = list(title = "Life Expectancy"))
animated_plot
## No scatter mode specifed:
## Setting the mode to markers
## Read more about this attribute -> https://plotly.com/r/reference/#scatter-mode
This visual allows us to see the relationship between GDP per capita
and life expectancy over the 55 years. The interactive element allows us
to hit play, and see how each country and continent change over the
years. Generally, as seen in the visual, as time progresses, GDP per
capita increases and so does life expectancy. This positive relationship
is due to the fact that when the economy is doing well and people are
contributing towards a countries increasing wealth, you see life
expectancy as a result increase. Likely, people are able to have better
food, shelter, health and healthcare, etc. The interactive element also
allows us to notice irregular movements of outliers such as Kuwait and
other