library(DBI)
## Warning: package 'DBI' was built under R version 4.3.3
library(RSQLite)
## Warning: package 'RSQLite' was built under R version 4.3.3
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(plotly)
## Loading required package: ggplot2
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
con <- dbConnect(RSQLite::SQLite(), ":memory:")
actor <- read.csv("/Users/rishithvuppala/Downloads/actor.csv")
category <- read.csv("/Users/rishithvuppala/Downloads/category.csv")
customer <- read.csv("/Users/rishithvuppala/Downloads/customer.csv")
film_actor <- read.csv("/Users/rishithvuppala/Downloads/film_actor.csv")
film_category <- read.csv("/Users/rishithvuppala/Downloads/film_category.csv")
film <- read.csv("/Users/rishithvuppala/Downloads/film.csv")
inventory <- read.csv("/Users/rishithvuppala/Downloads/inventory.csv")
payment <- read.csv("/Users/rishithvuppala/Downloads/payment.csv")
rental <- read.csv("/Users/rishithvuppala/Downloads/rental.csv")
dbWriteTable(con, "actor", actor, overwrite = TRUE)
dbWriteTable(con, "category", category, overwrite = TRUE)
dbWriteTable(con, "customer", customer, overwrite = TRUE)
dbWriteTable(con, "film_actor", film_actor, overwrite = TRUE)
dbWriteTable(con, "film_category", film_category, overwrite = TRUE)
dbWriteTable(con, "film", film, overwrite = TRUE)
dbWriteTable(con, "inventory", inventory, overwrite = TRUE)
dbWriteTable(con, "payment", payment, overwrite = TRUE)
dbWriteTable(con, "rental", rental, overwrite = TRUE)
dbListTables(con)
## [1] "actor" "category" "customer" "film"
## [5] "film_actor" "film_category" "inventory" "payment"
## [9] "rental"
SQL
Problem 1
SELECT
c.first_name,
c.last_name,
c.email,
COUNT(r.rental_id) AS total_rentals
FROM
customer c
JOIN
rental r ON c.customer_id = r.customer_id
GROUP BY
c.customer_id
ORDER BY
total_rentals DESC
LIMIT 10;
Problem 2
SELECT
f.title,
f.description
FROM
film f
LEFT JOIN
inventory i ON f.film_id = i.film_id
LEFT JOIN
rental r ON i.inventory_id = r.inventory_id
WHERE
r.rental_id IS NULL
ORDER BY
f.title ASC;
Displaying records 1 - 10
| ACADEMY DINOSAUR |
A Epic Drama of a Feminist And a Mad Scientist who must
Battle a Teacher in The Canadian Rockies |
| ALICE FANTASIA |
A Emotional Drama of a A Shark And a Database
Administrator who must Vanquish a Pioneer in Soviet Georgia |
| APOLLO TEEN |
A Action-Packed Reflection of a Crocodile And a
Explorer who must Find a Sumo Wrestler in An Abandoned Mine Shaft |
| ARGONAUTS TOWN |
A Emotional Epistle of a Forensic Psychologist And a
Butler who must Challenge a Waitress in An Abandoned Mine Shaft |
| ARK RIDGEMONT |
A Beautiful Yarn of a Pioneer And a Monkey who must
Pursue a Explorer in The Sahara Desert |
| ARSENIC INDEPENDENCE |
A Fanciful Documentary of a Mad Cow And a Womanizer who
must Find a Dentist in Berlin |
| BOONDOCK BALLROOM |
A Fateful Panorama of a Crocodile And a Boy who must
Defeat a Monkey in The Gulf of Mexico |
| BUTCH PANTHER |
A Lacklusture Yarn of a Feminist And a Database
Administrator who must Face a Hunter in New Orleans |
| CATCH AMISTAD |
A Boring Reflection of a Lumberjack And a Feminist who
must Discover a Woman in Nigeria |
| CHINATOWN GLADIATOR |
A Brilliant Panorama of a Technical Writer And a
Lumberjack who must Escape a Butler in Ancient India |
Problem 3
SELECT
cat.name AS category_name,
AVG(f.length) AS average_length
FROM
category cat
JOIN
film_category fc ON cat.category_id = fc.category_id
JOIN
film f ON fc.film_id = f.film_id
GROUP BY
cat.category_id
ORDER BY
average_length DESC;
Displaying records 1 - 10
| Sports |
128.2027 |
| Games |
127.8361 |
| Foreign |
121.6986 |
| Drama |
120.8387 |
| Comedy |
115.8276 |
| Family |
114.7826 |
| Music |
113.6471 |
| Travel |
113.3158 |
| Horror |
112.4821 |
| Classics |
111.6667 |
Problem 4
SELECT
a.first_name,
a.last_name,
COUNT(fa.film_id) AS film_count
FROM
actor a
JOIN
film_actor fa ON a.actor_id = fa.actor_id
GROUP BY
a.actor_id
ORDER BY
film_count DESC
LIMIT 5;
5 records
| GINA |
DEGENERES |
42 |
| WALTER |
TORN |
41 |
| MARY |
KEITEL |
40 |
| MATTHEW |
CARREY |
39 |
| SANDRA |
KILMER |
37 |
Problem 5
SELECT DISTINCT
c.first_name,
c.last_name
FROM
customer c
JOIN
rental r ON c.customer_id = r.customer_id
JOIN
inventory i ON r.inventory_id = i.inventory_id
JOIN
film_actor fa ON i.film_id = fa.film_id
JOIN
actor a ON fa.actor_id = a.actor_id
WHERE
a.first_name = 'Johnny'
AND a.last_name = 'Depp';
No customers have rented a film starring Johnny Depp.
Problem 6
SELECT
f.title,
SUM(p.amount) AS total_revenue
FROM
film f
JOIN
inventory i ON f.film_id = i.film_id
JOIN
rental r ON i.inventory_id = r.inventory_id
JOIN
payment p ON r.rental_id = p.rental_id
GROUP BY
f.film_id
ORDER BY
total_revenue DESC
LIMIT 10;
Displaying records 1 - 10
| TELEGRAPH VOYAGE |
231.73 |
| WIFE TURN |
223.69 |
| ZORRO ARK |
214.69 |
| GOODFELLAS SALUTE |
209.69 |
| SATURDAY LAMBS |
204.72 |
| TITANS JERK |
201.71 |
| TORQUE BOUND |
198.72 |
| HARRY IDAHO |
195.70 |
| INNOCENT USUAL |
191.74 |
| HUSTLER PARTY |
190.78 |
Plotly
Problem 1
data("diamonds")
p <- ggplot(diamonds, aes(
x = depth,
y = price,
color = clarity,
text = paste(
"Clarity: ", clarity,
"<br>Cut: ", cut,
"<br>Color: ", color,
"<br>Price: $", format(price, big.mark = ",")
)
)) +
geom_point(alpha = 0.5) +
labs(
title = "Diamond Price vs. Depth",
subtitle = "Exploring how the depth percentage affects diamond price",
x = "Depth (%)",
y = "Price (USD)"
) +
theme_minimal()
plotly_plot <- ggplotly(p, tooltip = "text")
plotly_plot
Problem 2
ggplot2 and Plotly are both tools used for data visualization, but
serve different goals. ggplot2 is a library in R that is static and
allows users to build layered plots (different types of them as well)
with plenty of cutomizable options. The plots and visualizations made
using ggplot2 are primarily used for data exploration and statistical
analysis. Personally, I’ve used it when doing research in labs as the
plots made here opposed to other services such as Word can be customized
and fit for academic and research papers.
Plotly is an interactive plotting library that has interactive
features such as zooming, panning, and hover information. In this way it
explores data, but allows user to actively engage in doing so. It also
has animations that allows to show visualizations over time. As such, I
haven’t really seen this being used in academic/research settings.
However, the interactive nature of Plotly still allows for EDA and can
be used in web apps and dashboards. Customizing plots in Plotly is more
difficult than ggplot2 and while making a plot with the diamonds
dataset, for example, I noticed that it often takes longer to develop.
With this being said, you would use ggplot2 over Plotly when creating
static charts used in scientific journals (i.e Nature, ACS, etc.).
Plotly can be used for interactive presentations or can be used for
business analytics when showcasing data to clients and allowing them to
interact with the data as it’s engaging. This provides a deeper user
experience opposed to traditional plots made in ggplot2, where data is
often supplemented with tables or explanations.
Problem 3
library(gapminder)
data("gapminder")
gapminder_filtered <- gapminder %>%
filter(year >= 1952)
plotly_plot <- gapminder_filtered %>%
plot_ly(
x = ~gdpPercap,
y = ~lifeExp,
color = ~continent,
size = ~pop,
frame = ~year,
text = ~paste(
"Country: ", country,
"<br>Year: ", year,
"<br>Life Expectancy: ", lifeExp,
"<br>GDP per Capita ($): ", format(round(gdpPercap, 3), big.mark = ","),
"<br>Population: ", format(pop, big.mark = ",")
),
hoverinfo = "text",
type = 'scatter',
mode = 'markers',
sizes = c(10, 50),
marker = list(opacity = 0.6, sizemode = 'area')
) %>%
layout(
title = "Life Expectancy vs. GDP per Capita Over Time",
xaxis = list(title = "GDP per Capita ($)", type = "log"),
yaxis = list(title = "Life Expectancy (years)")
) %>%
animation_opts(
frame = 1000,
transition = 0,
redraw = FALSE
)
plotly_plot
## Warning: `line.width` does not currently support multiple values.
## Warning: `line.width` does not currently support multiple values.
## Warning: `line.width` does not currently support multiple values.
## Warning: `line.width` does not currently support multiple values.
## Warning: `line.width` does not currently support multiple values.
## Warning: `line.width` does not currently support multiple values.
## Warning: `line.width` does not currently support multiple values.
## Warning: `line.width` does not currently support multiple values.
## Warning: `line.width` does not currently support multiple values.
## Warning: `line.width` does not currently support multiple values.
## Warning: `line.width` does not currently support multiple values.
## Warning: `line.width` does not currently support multiple values.
## Warning: `line.width` does not currently support multiple values.
## Warning: `line.width` does not currently support multiple values.
## Warning: `line.width` does not currently support multiple values.
## Warning: `line.width` does not currently support multiple values.
## Warning: `line.width` does not currently support multiple values.
## Warning: `line.width` does not currently support multiple values.
## Warning: `line.width` does not currently support multiple values.
## Warning: `line.width` does not currently support multiple values.
## Warning: `line.width` does not currently support multiple values.
## Warning: `line.width` does not currently support multiple values.
## Warning: `line.width` does not currently support multiple values.
## Warning: `line.width` does not currently support multiple values.
## Warning: `line.width` does not currently support multiple values.
## Warning: `line.width` does not currently support multiple values.
## Warning: `line.width` does not currently support multiple values.
## Warning: `line.width` does not currently support multiple values.
## Warning: `line.width` does not currently support multiple values.
## Warning: `line.width` does not currently support multiple values.
## Warning: `line.width` does not currently support multiple values.
## Warning: `line.width` does not currently support multiple values.
## Warning: `line.width` does not currently support multiple values.
## Warning: `line.width` does not currently support multiple values.
## Warning: `line.width` does not currently support multiple values.
## Warning: `line.width` does not currently support multiple values.
## Warning: `line.width` does not currently support multiple values.
## Warning: `line.width` does not currently support multiple values.
## Warning: `line.width` does not currently support multiple values.
## Warning: `line.width` does not currently support multiple values.
## Warning: `line.width` does not currently support multiple values.
## Warning: `line.width` does not currently support multiple values.
## Warning: `line.width` does not currently support multiple values.
## Warning: `line.width` does not currently support multiple values.
## Warning: `line.width` does not currently support multiple values.
## Warning: `line.width` does not currently support multiple values.
## Warning: `line.width` does not currently support multiple values.
## Warning: `line.width` does not currently support multiple values.
## Warning: `line.width` does not currently support multiple values.
## Warning: `line.width` does not currently support multiple values.
## Warning: `line.width` does not currently support multiple values.
## Warning: `line.width` does not currently support multiple values.
## Warning: `line.width` does not currently support multiple values.
## Warning: `line.width` does not currently support multiple values.
## Warning: `line.width` does not currently support multiple values.
## Warning: `line.width` does not currently support multiple values.
## Warning: `line.width` does not currently support multiple values.
## Warning: `line.width` does not currently support multiple values.
## Warning: `line.width` does not currently support multiple values.
## Warning: `line.width` does not currently support multiple values.
This interactive plot created with Plotly and using the Gapminder
dataset visualizes how countries’ Life expectancy and GDP per Capita
changed over time. It reveals how over time as countries’ prospered
economically, the age of their populations increased as well. Life
Expectancy is on the y-axis and GDP per Capita is on the x-axis, and the
slider controls the time (year). It suggests that economic growth is
often consistent with life expectancy, especially when this is
appropriated to the necessary services being healthcare, sanitation,
education, etc.