con <- dbConnect(drv = RSQLite::SQLite(),
dbname = ":memory:")
customer <- read.csv("customer.csv")
rental <- read.csv("rental.csv")
film <- read.csv("film.csv")
inventory <- read.csv("inventory.csv")
category <- read.csv("category.csv")
film_category <- read.csv("film_category.csv")
actor <- read.csv("actor.csv")
film_actor <- read.csv("film_actor.csv")
payment <- read.csv("payment.csv")
dbWriteTable(conn = con, name = "customer", value = customer)
dbWriteTable(conn = con, name = "rental", value = rental)
dbWriteTable(conn = con, name = "film", value = film)
dbWriteTable(conn = con, name = "inventory", value = inventory)
dbWriteTable(conn = con, name = "category", value = category)
dbWriteTable(conn = con, name = "film_category", value = film_category)
dbWriteTable(conn = con, name = "actor", value = actor)
dbWriteTable(conn = con, name = "film_actor", value = film_actor)
dbWriteTable(conn = con, name = "payment", value = payment)
knitr::opts_chunk$set(connection = "con")
dbListTables(con)
## [1] "actor" "category" "customer" "film"
## [5] "film_actor" "film_category" "inventory" "payment"
## [9] "rental"
SELECT first_name, last_name, email, COUNT(*)
FROM customer JOIN rental USING (customer_id)
GROUP BY first_name
ORDER BY COUNT(*) DESC
LIMIT 10
| first_name | last_name | COUNT(*) | |
|---|---|---|---|
| MARION | SNYDER | MARION.SNYDER@sakilacustomer.org | 68 |
| JESSIE | BANKS | JESSIE.BANKS@sakilacustomer.org | 59 |
| TRACY | COLE | TRACY.COLE@sakilacustomer.org | 58 |
| LESLIE | GORDON | LESLIE.GORDON@sakilacustomer.org | 57 |
| JAMIE | RICE | JAMIE.RICE@sakilacustomer.org | 54 |
| WILLIE | HOWELL | WILLIE.HOWELL@sakilacustomer.org | 51 |
| TERRY | CARLSON | TERRY.CARLSON@sakilacustomer.org | 49 |
| KELLY | TORRES | KELLY.TORRES@sakilacustomer.org | 47 |
| ELEANOR | HUNT | ELEANOR.HUNT@sakilacustomer.org | 46 |
| KARL | SEAL | KARL.SEAL@sakilacustomer.org | 45 |
The top 10 customers who have rented the most films are Marion Snyder, Jessie Banks, Tracy Cole, Leslie Gordon, Jamie Rice, Willie Howell, Terry Carlson, Kelly Torres, Eleanor Hunt, and Karl Seal.
SELECT title, description
FROM film LEFT JOIN inventory ON film.film_id = inventory.film_id LEFT JOIN rental ON inventory.inventory_id = rental.inventory_id
WHERE rental.rental_id IS NULL
| title | description |
|---|---|
| ACADEMY DINOSAUR | A Epic Drama of a Feminist And a Mad Scientist who must Battle a Teacher in The Canadian Rockies |
| ALICE FANTASIA | A Emotional Drama of a A Shark And a Database Administrator who must Vanquish a Pioneer in Soviet Georgia |
| APOLLO TEEN | A Action-Packed Reflection of a Crocodile And a Explorer who must Find a Sumo Wrestler in An Abandoned Mine Shaft |
| ARGONAUTS TOWN | A Emotional Epistle of a Forensic Psychologist And a Butler who must Challenge a Waitress in An Abandoned Mine Shaft |
| ARK RIDGEMONT | A Beautiful Yarn of a Pioneer And a Monkey who must Pursue a Explorer in The Sahara Desert |
| ARSENIC INDEPENDENCE | A Fanciful Documentary of a Mad Cow And a Womanizer who must Find a Dentist in Berlin |
| BOONDOCK BALLROOM | A Fateful Panorama of a Crocodile And a Boy who must Defeat a Monkey in The Gulf of Mexico |
| BUTCH PANTHER | A Lacklusture Yarn of a Feminist And a Database Administrator who must Face a Hunter in New Orleans |
| CATCH AMISTAD | A Boring Reflection of a Lumberjack And a Feminist who must Discover a Woman in Nigeria |
| CHINATOWN GLADIATOR | A Brilliant Panorama of a Technical Writer And a Lumberjack who must Escape a Butler in Ancient India |
There are 43 different moves that have never been rented.
SELECT category.name, AVG(length)
FROM film JOIN film_category USING (film_id) JOIN category USING (category_id)
GROUP BY category.name
ORDER BY AVG(length) DESC
| name | AVG(length) |
|---|---|
| Sports | 128.2027 |
| Games | 127.8361 |
| Foreign | 121.6986 |
| Drama | 120.8387 |
| Comedy | 115.8276 |
| Family | 114.7826 |
| Music | 113.6471 |
| Travel | 113.3158 |
| Horror | 112.4821 |
| Classics | 111.6667 |
The average length of a film by genre ranges from 111.67 minutes with Classics to around 128.20 minutes with Sports.
SELECT first_name, last_name, COUNT(*)
FROM film JOIN film_actor USING (film_id) JOIN actor USING (actor_id)
GROUP BY first_name
ORDER BY COUNT(*) DESC
LIMIT 5
| first_name | last_name | COUNT(*) |
|---|---|---|
| KENNETH | PESCI | 103 |
| PENELOPE | GUINESS | 102 |
| JAYNE | NEESON | 90 |
| MATTHEW | LEIGH | 89 |
| JULIA | MCQUEEN | 88 |
The 5 actors who have appeared in the most films are Kenneth Pesci, Penelope Guiness, Jayne Neeson, Matthew Leigh, and Julia McQueen.
SELECT customer.first_name, customer.last_name
FROM customer JOIN rental USING (customer_id) JOIN inventory USING (inventory_id) JOIN film_actor USING (film_id) JOIN actor USING (actor_id) WHERE actor.first_name = "JOHNNY" AND actor.last_name = "DEPP"
GROUP BY customer.first_name
ORDER BY COUNT(*) DESC
| first_name | last_name |
|---|
There appears to be no one in the data set to have ordered a Johnny Depp movie for their rentals.
SELECT title, SUM(film.rental_rate) AS tot_rev
FROM film JOIN inventory USING (film_id) JOIN rental USING (inventory_id)
GROUP BY title
ORDER BY tot_rev DESC
LIMIT 10
| title | tot_rev |
|---|---|
| BUCKET BROTHERHOOD | 169.66 |
| SCALAWAG DUCK | 159.68 |
| ZORRO ARK | 154.69 |
| WIFE TURN | 154.69 |
| GOODFELLAS SALUTE | 154.69 |
| APACHE DIVINE | 154.69 |
| WITCHES PANIC | 149.70 |
| MASSACRE USUAL | 149.70 |
| HARRY IDAHO | 149.70 |
| DOGMA FAMILY | 149.70 |
The top 10 films by revenue range from 149.70 with Dogma Family to 169.66 with Bucket Brotherhood.
plot_ly(diamonds,
x = ~carat,
y = ~price,
color = ~color,
type = "scatter",
text = ~cut,
hoverinfo = "text") %>%
layout(title = "Correlation betweent the carat weight of a diamond and its price",
xaxis = list(title = "Carat Weight"),
yaxis = list(title = "Price"))
## No scatter mode specifed:
## Setting the mode to markers
## Read more about this attribute -> https://plotly.com/r/reference/#scatter-mode
The table shows the correlation between the Carat Weight of a diamond and its corresponding price. The point are also differentiated by color which denotes the color of the diamond, and additionally, if you hover over the points in the graphic, you can see the cut of the diamond.
The main difference between the graphs is the ability to interact. While the plotly includes the ability to incorporate interactive elements to the graphic, the ggplot doesn’t allow for this. However, the ggplot allows from better customization in terms of themes, scaling, etc. Additionally, the interactive nature of the plotly is only useful in non-print settings.
plot_ly(gapminder,
x = ~year,
y = ~pop,
color = ~continent,
type = "scatter",
mode = "lines+markers",
text = ~country,
hoverinfo = "text") %>%
layout(title = "Population Growth between 1952 and 2007 by Country",
xaxis = list(title = "Year"),
yaxis = list(title = "Population"))
This plotly shows the growth of populations over a 55 year span by country. The two variables being compared are the countries and the population. Additionally, the lines are color based on their continent. The implication of this graphic is that While many countries have reached a stabilizing period, countries such as India and China continue to grow at a remarkable pace. This could be due to a variety of genetic and geographic factor as well as some policy-based factors.