con <- dbConnect(drv = RSQLite::SQLite(),
dbname = ":memory:")
customer <- read.csv("/Users/sophiekemprecos/Desktop/3280/HW 09/HW 9 Data/customer.csv")
rental <- read.csv("/Users/sophiekemprecos/Desktop/3280/HW 09/HW 9 Data/rental.csv")
film <- read.csv("/Users/sophiekemprecos/Desktop/3280/HW 09/HW 9 Data/film.csv")
inventory <- read.csv("/Users/sophiekemprecos/Desktop/3280/HW 09/HW 9 Data/inventory.csv")
category <- read.csv("/Users/sophiekemprecos/Desktop/3280/HW 09/HW 9 Data/category.csv")
film_category <- read.csv("/Users/sophiekemprecos/Desktop/3280/HW 09/HW 9 Data/film_category.csv")
actor <- read.csv("/Users/sophiekemprecos/Desktop/3280/HW 09/HW 9 Data/actor.csv")
film_actor <- read.csv("/Users/sophiekemprecos/Desktop/3280/HW 09/HW 9 Data/film_actor.csv")
payment <- read.csv("/Users/sophiekemprecos/Desktop/3280/HW 09/HW 9 Data/payment.csv")
dbWriteTable(conn = con, name = "customer", value = customer)
dbWriteTable(conn = con, name = "rental", value = rental)
dbWriteTable(conn = con, name = "film", value = film)
dbWriteTable(conn = con, name = "inventory", value = inventory)
dbWriteTable(conn = con, name = "category", value = category)
dbWriteTable(conn = con, name = "film_category", value = film_category)
dbWriteTable(conn = con, name = "actor", value = actor)
dbWriteTable(conn = con, name = "film_actor", value = film_actor)
dbWriteTable(conn = con, name = "payment", value = payment)
knitr::opts_chunk$set(connection = "con")
dbListTables(con)
## [1] "actor" "category" "customer" "film"
## [5] "film_actor" "film_category" "inventory" "payment"
## [9] "rental"
SELECT
customer.first_name,
customer.last_name,
customer.email,
COUNT(rental.rental_id) AS total_rentals
FROM customer JOIN rental ON customer.customer_id = rental.customer_id
GROUP BY
customer.customer_id
ORDER BY
total_rentals DESC
LIMIT 10;
| first_name | last_name | total_rentals | |
|---|---|---|---|
| ELEANOR | HUNT | ELEANOR.HUNT@sakilacustomer.org | 46 |
| KARL | SEAL | KARL.SEAL@sakilacustomer.org | 45 |
| MARCIA | DEAN | MARCIA.DEAN@sakilacustomer.org | 42 |
| CLARA | SHAW | CLARA.SHAW@sakilacustomer.org | 42 |
| TAMMY | SANDERS | TAMMY.SANDERS@sakilacustomer.org | 41 |
| WESLEY | BULL | WESLEY.BULL@sakilacustomer.org | 40 |
| SUE | PETERS | SUE.PETERS@sakilacustomer.org | 40 |
| TIM | CARY | TIM.CARY@sakilacustomer.org | 39 |
| MARION | SNYDER | MARION.SNYDER@sakilacustomer.org | 39 |
| RHONDA | KENNEDY | RHONDA.KENNEDY@sakilacustomer.org | 39 |
SELECT
title, description
FROM film
LEFT JOIN inventory
ON film.film_id = inventory.film_id
LEFT JOIN rental
ON inventory.inventory_id = rental.inventory_id
WHERE rental.rental_id IS NULL
ORDER BY
film.title;
| title | description |
|---|---|
| ACADEMY DINOSAUR | A Epic Drama of a Feminist And a Mad Scientist who must Battle a Teacher in The Canadian Rockies |
| ALICE FANTASIA | A Emotional Drama of a A Shark And a Database Administrator who must Vanquish a Pioneer in Soviet Georgia |
| APOLLO TEEN | A Action-Packed Reflection of a Crocodile And a Explorer who must Find a Sumo Wrestler in An Abandoned Mine Shaft |
| ARGONAUTS TOWN | A Emotional Epistle of a Forensic Psychologist And a Butler who must Challenge a Waitress in An Abandoned Mine Shaft |
| ARK RIDGEMONT | A Beautiful Yarn of a Pioneer And a Monkey who must Pursue a Explorer in The Sahara Desert |
| ARSENIC INDEPENDENCE | A Fanciful Documentary of a Mad Cow And a Womanizer who must Find a Dentist in Berlin |
| BOONDOCK BALLROOM | A Fateful Panorama of a Crocodile And a Boy who must Defeat a Monkey in The Gulf of Mexico |
| BUTCH PANTHER | A Lacklusture Yarn of a Feminist And a Database Administrator who must Face a Hunter in New Orleans |
| CATCH AMISTAD | A Boring Reflection of a Lumberjack And a Feminist who must Discover a Woman in Nigeria |
| CHINATOWN GLADIATOR | A Brilliant Panorama of a Technical Writer And a Lumberjack who must Escape a Butler in Ancient India |
SELECT
category.name,
AVG(film.length) AS avg_length
FROM film
JOIN film_category
ON film.film_id = film_category.film_id
JOIN category
ON film_category.category_id = category.category_id
GROUP BY category.name
ORDER BY
avg_length DESC;
| name | avg_length |
|---|---|
| Sports | 128.2027 |
| Games | 127.8361 |
| Foreign | 121.6986 |
| Drama | 120.8387 |
| Comedy | 115.8276 |
| Family | 114.7826 |
| Music | 113.6471 |
| Travel | 113.3158 |
| Horror | 112.4821 |
| Classics | 111.6667 |
SELECT
actor.first_name, actor.last_name,
COUNT(film_actor.film_id) AS film_count
FROM actor
JOIN film_actor
ON actor.actor_id = film_actor.actor_id
GROUP BY actor.actor_id
ORDER BY film_count DESC
LIMIT 5;
| first_name | last_name | film_count |
|---|---|---|
| GINA | DEGENERES | 42 |
| WALTER | TORN | 41 |
| MARY | KEITEL | 40 |
| MATTHEW | CARREY | 39 |
| SANDRA | KILMER | 37 |
SELECT customer.first_name, customer.last_name
FROM customer
JOIN rental
ON customer.customer_id = rental.customer_id
JOIN inventory
ON rental.inventory_id = inventory.inventory_id
JOIN film
ON inventory.film_id = film.film_id
JOIN film_actor
ON film.film_id = film_actor.film_id
JOIN actor
ON film_actor.actor_id = actor.actor_id
WHERE actor.first_name = 'Johnny'
AND actor.last_name = 'Depp'
ORDER BY customer.last_name, customer.first_name;
| first_name | last_name |
|---|
SELECT film.title,
SUM(payment.amount) AS total_revenue
FROM film
JOIN inventory
ON film.film_id = inventory.film_id
JOIN rental
ON inventory.inventory_id = rental.inventory_id
JOIN payment
ON rental.rental_id = payment.rental_id
GROUP BY film.title
ORDER BY total_revenue DESC
LIMIT 10;
| title | total_revenue |
|---|---|
| TELEGRAPH VOYAGE | 231.73 |
| WIFE TURN | 223.69 |
| ZORRO ARK | 214.69 |
| GOODFELLAS SALUTE | 209.69 |
| SATURDAY LAMBS | 204.72 |
| TITANS JERK | 201.71 |
| TORQUE BOUND | 198.72 |
| HARRY IDAHO | 195.70 |
| INNOCENT USUAL | 191.74 |
| HUSTLER PARTY | 190.78 |
Use the diamonds dataset to create a 2D plotly scatterplot (you can choose the variables). The plot should have a title and subtitle, axis labels, and hover information that is useful.
The following plotly shows the relationship between carat level and price. We see this mapped as a scatterplot in blue showing a positive relationship between the two.
scat_plot <- plot_ly(data = diamonds, x = ~carat, y = ~price, marker = list(color = 'navyblue'))
scat_plot <- scat_plot %>%
layout(title = list(text = paste0("Relationship Between Carat and Price",
"<br><sup>Higher-carat diamonds tend to cost more</sup>")),
xaxis = list(title = 'Carat'), yaxis = list(title = 'Price'))
scat_plot
## No trace type specified:
## Based on info supplied, a 'scatter' trace seems appropriate.
## Read more about this trace type -> https://plotly.com/r/reference/#scatter
## No scatter mode specifed:
## Setting the mode to markers
## Read more about this attribute -> https://plotly.com/r/reference/#scatter-mode
Compare and contrast ggplot2 and plotly. What is an example of a time when you would definitely want to use each one over the other?
GGplot2 and Plotly have very similar functions. Both are sophisticated and easy to use software for creating descriptive plots in different formats. The above scatterplot using Plotly could be made very similarly using ggplot2 for a very close outcome. One major difference is that when there is a large amount of data being represented, Plotly allows you to see descriptive information for individual data points using the hover mechanism. For example, if you wanted to see the values of an outlier or a values in a mix of data points Plotly allows you to see specific values. When doing EDA the interactivitey is a very useful feature, however it does not knit into the pdf with the interactive features. This is one example where Plotly does not carry its advantages in some formats, so that is an important consideration for when to use ggplot or plotly. A good time to use plotly would also be when creating a dashboard. This is similar to Tableau, but you can make accessible dashboards or animations of plotly that can be shared.
Using the gapminder dataset, make a Plotly plot which compares two variables over time. Explain the meaning of this comparison in context and describe what narrative is advanced by this comparison.
The graph shows that as GDP per capita increases over time, life expectancy also tends to rise. These are not parallel trends but we do see significant changes over time. This is because better economic growth reflected by gdp per capita will also improve health outcomes and longevity of people. The narrative this advances is that economic development and people’s health are closely linked, where as a country becomes richer, people have better healthcare and can live longer. At the same time, the plot also suggests that gains in life expectancy may slow down at higher income levels, which may be due to diminishing returns from more income.
gap_data <- gapminder %>%
filter(country == "Indonesia")
gap_plot <- plot_ly()
gap_plot <- gap_plot %>%
add_trace(data = gap_data, x = ~year, y = ~lifeExp, name = "Life Expectancy", line = list(color = "pink")) %>%
add_trace(data = gap_data, x = ~year, y = ~gdpPercap/1000, name = "GDP Per Capita (thousands)", line = list(color = "purple")) %>%
layout(title = "Life Expectancy and GDP per Capita in Indonesia",
xaxis = list(title = "Year"),
yaxis = list(title = "Life Expectancy / GDP per Capita in thousands")) %>% style(visible = "legendonly", traces = 0)
## No trace type specified:
## Based on info supplied, a 'scatter' trace seems appropriate.
## Read more about this trace type -> https://plotly.com/r/reference/#scatter
## No scatter mode specifed:
## Setting the mode to markers
## Read more about this attribute -> https://plotly.com/r/reference/#scatter-mode
## No trace type specified:
## Based on info supplied, a 'scatter' trace seems appropriate.
## Read more about this trace type -> https://plotly.com/r/reference/#scatter
## No scatter mode specifed:
## Setting the mode to markers
## Read more about this attribute -> https://plotly.com/r/reference/#scatter-mode
## A line object has been specified, but lines is not in the mode
## Adding lines to the mode...
## A line object has been specified, but lines is not in the mode
## Adding lines to the mode...
gap_plot