library(DBI)
## Warning: package 'DBI' was built under R version 4.3.3
library(RSQLite)
## Warning: package 'RSQLite' was built under R version 4.3.3
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(plotly)
## Loading required package: ggplot2
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
con <- dbConnect(RSQLite::SQLite(), ":memory:")

actor <- read.csv("/Users/rishithvuppala/Downloads/actor.csv")
category <- read.csv("/Users/rishithvuppala/Downloads/category.csv")
customer <- read.csv("/Users/rishithvuppala/Downloads/customer.csv")
film_actor <- read.csv("/Users/rishithvuppala/Downloads/film_actor.csv")
film_category <- read.csv("/Users/rishithvuppala/Downloads/film_category.csv")
film <- read.csv("/Users/rishithvuppala/Downloads/film.csv")
inventory <- read.csv("/Users/rishithvuppala/Downloads/inventory.csv")
payment <- read.csv("/Users/rishithvuppala/Downloads/payment.csv")
rental <- read.csv("/Users/rishithvuppala/Downloads/rental.csv")

dbWriteTable(con, "actor", actor, overwrite = TRUE)
dbWriteTable(con, "category", category, overwrite = TRUE)
dbWriteTable(con, "customer", customer, overwrite = TRUE)
dbWriteTable(con, "film_actor", film_actor, overwrite = TRUE)
dbWriteTable(con, "film_category", film_category, overwrite = TRUE)
dbWriteTable(con, "film", film, overwrite = TRUE)
dbWriteTable(con, "inventory", inventory, overwrite = TRUE)
dbWriteTable(con, "payment", payment, overwrite = TRUE)
dbWriteTable(con, "rental", rental, overwrite = TRUE)

dbListTables(con)
## [1] "actor"         "category"      "customer"      "film"         
## [5] "film_actor"    "film_category" "inventory"     "payment"      
## [9] "rental"

SQL

Problem 1

SELECT 
    c.first_name,
    c.last_name,
    c.email,
    COUNT(r.rental_id) AS total_rentals
FROM 
    customer c
JOIN 
    rental r ON c.customer_id = r.customer_id
GROUP BY 
    c.customer_id
ORDER BY 
    total_rentals DESC
LIMIT 10;
Displaying records 1 - 10
first_name last_name email total_rentals
ELEANOR HUNT 46
KARL SEAL 45
MARCIA DEAN 42
CLARA SHAW 42
TAMMY SANDERS 41
WESLEY BULL 40
SUE PETERS 40
TIM CARY 39
MARION SNYDER 39
RHONDA KENNEDY 39

Problem 2

SELECT 
    f.title,
    f.description
FROM 
    film f
LEFT JOIN 
    inventory i ON f.film_id = i.film_id
LEFT JOIN 
    rental r ON i.inventory_id = r.inventory_id
WHERE 
    r.rental_id IS NULL
ORDER BY 
    f.title ASC;
Displaying records 1 - 10
title description
ACADEMY DINOSAUR A Epic Drama of a Feminist And a Mad Scientist who must Battle a Teacher in The Canadian Rockies
ALICE FANTASIA A Emotional Drama of a A Shark And a Database Administrator who must Vanquish a Pioneer in Soviet Georgia
APOLLO TEEN A Action-Packed Reflection of a Crocodile And a Explorer who must Find a Sumo Wrestler in An Abandoned Mine Shaft
ARGONAUTS TOWN A Emotional Epistle of a Forensic Psychologist And a Butler who must Challenge a Waitress in An Abandoned Mine Shaft
ARK RIDGEMONT A Beautiful Yarn of a Pioneer And a Monkey who must Pursue a Explorer in The Sahara Desert
ARSENIC INDEPENDENCE A Fanciful Documentary of a Mad Cow And a Womanizer who must Find a Dentist in Berlin
BOONDOCK BALLROOM A Fateful Panorama of a Crocodile And a Boy who must Defeat a Monkey in The Gulf of Mexico
BUTCH PANTHER A Lacklusture Yarn of a Feminist And a Database Administrator who must Face a Hunter in New Orleans
CATCH AMISTAD A Boring Reflection of a Lumberjack And a Feminist who must Discover a Woman in Nigeria
CHINATOWN GLADIATOR A Brilliant Panorama of a Technical Writer And a Lumberjack who must Escape a Butler in Ancient India

Problem 3

SELECT 
    cat.name AS category_name,
    AVG(f.length) AS average_length
FROM 
    category cat
JOIN 
    film_category fc ON cat.category_id = fc.category_id
JOIN 
    film f ON fc.film_id = f.film_id
GROUP BY 
    cat.category_id
ORDER BY 
    average_length DESC;
Displaying records 1 - 10
category_name average_length
Sports 128.2027
Games 127.8361
Foreign 121.6986
Drama 120.8387
Comedy 115.8276
Family 114.7826
Music 113.6471
Travel 113.3158
Horror 112.4821
Classics 111.6667

Problem 4

SELECT 
    a.first_name,
    a.last_name,
    COUNT(fa.film_id) AS film_count
FROM 
    actor a
JOIN 
    film_actor fa ON a.actor_id = fa.actor_id
GROUP BY 
    a.actor_id
ORDER BY 
    film_count DESC
LIMIT 5;
5 records
first_name last_name film_count
GINA DEGENERES 42
WALTER TORN 41
MARY KEITEL 40
MATTHEW CARREY 39
SANDRA KILMER 37

Problem 5

SELECT DISTINCT 
    c.first_name,
    c.last_name
FROM 
    customer c
JOIN 
    rental r ON c.customer_id = r.customer_id
JOIN 
    inventory i ON r.inventory_id = i.inventory_id
JOIN 
    film_actor fa ON i.film_id = fa.film_id
JOIN 
    actor a ON fa.actor_id = a.actor_id
WHERE 
    a.first_name = 'Johnny' 
    AND a.last_name = 'Depp';
0 records
first_name last_name

No customers have rented a film starring Johnny Depp.

Problem 6

SELECT 
    f.title,
    SUM(p.amount) AS total_revenue
FROM 
    film f
JOIN 
    inventory i ON f.film_id = i.film_id
JOIN 
    rental r ON i.inventory_id = r.inventory_id
JOIN 
    payment p ON r.rental_id = p.rental_id
GROUP BY 
    f.film_id
ORDER BY 
    total_revenue DESC
LIMIT 10;
Displaying records 1 - 10
title total_revenue
TELEGRAPH VOYAGE 231.73
WIFE TURN 223.69
ZORRO ARK 214.69
GOODFELLAS SALUTE 209.69
SATURDAY LAMBS 204.72
TITANS JERK 201.71
TORQUE BOUND 198.72
HARRY IDAHO 195.70
INNOCENT USUAL 191.74
HUSTLER PARTY 190.78

Plotly

Problem 1

data("diamonds")

p <- ggplot(diamonds, aes(
    x = depth,
    y = price,
    color = clarity,
    text = paste(
        "Clarity: ", clarity,
        "<br>Cut: ", cut,
        "<br>Color: ", color,
        "<br>Price: $", format(price, big.mark = ",")
    )
)) +
    geom_point(alpha = 0.5) +
    labs(
        title = "Diamond Price vs. Depth",
        subtitle = "Exploring how the depth percentage affects diamond price",
        x = "Depth (%)",
        y = "Price (USD)"
    ) +
    theme_minimal()

plotly_plot <- ggplotly(p, tooltip = "text")

plotly_plot

Problem 2

ggplot2 and Plotly are both tools used for data visualization, but serve different goals. ggplot2 is a library in R that is static and allows users to build layered plots (different types of them as well) with plenty of cutomizable options. The plots and visualizations made using ggplot2 are primarily used for data exploration and statistical analysis. Personally, I’ve used it when doing research in labs as the plots made here opposed to other services such as Word can be customized and fit for academic and research papers.

Plotly is an interactive plotting library that has interactive features such as zooming, panning, and hover information. In this way it explores data, but allows user to actively engage in doing so. It also has animations that allows to show visualizations over time. As such, I haven’t really seen this being used in academic/research settings. However, the interactive nature of Plotly still allows for EDA and can be used in web apps and dashboards. Customizing plots in Plotly is more difficult than ggplot2 and while making a plot with the diamonds dataset, for example, I noticed that it often takes longer to develop. With this being said, you would use ggplot2 over Plotly when creating static charts used in scientific journals (i.e Nature, ACS, etc.). Plotly can be used for interactive presentations or can be used for business analytics when showcasing data to clients and allowing them to interact with the data as it’s engaging. This provides a deeper user experience opposed to traditional plots made in ggplot2, where data is often supplemented with tables or explanations.

Problem 3

library(gapminder)

data("gapminder")
gapminder_filtered <- gapminder %>%
    filter(year >= 1952)

plotly_plot <- gapminder_filtered %>%
    plot_ly(
        x = ~gdpPercap,
        y = ~lifeExp,
        color = ~continent,
        size = ~pop,
        frame = ~year,
        text = ~paste(
            "Country: ", country,
            "<br>Year: ", year,
            "<br>Life Expectancy: ", lifeExp,
            "<br>GDP per Capita ($): ", format(round(gdpPercap, 3), big.mark = ","),
            "<br>Population: ", format(pop, big.mark = ",")
        ),
        hoverinfo = "text",
        type = 'scatter',
        mode = 'markers',
        sizes = c(10, 50),
        marker = list(opacity = 0.6, sizemode = 'area')
    ) %>%
    layout(
        title = "Life Expectancy vs. GDP per Capita Over Time",
        xaxis = list(title = "GDP per Capita ($)", type = "log"),
        yaxis = list(title = "Life Expectancy (years)")
    ) %>%
    animation_opts(
        frame = 1000,
        transition = 0,
        redraw = FALSE
    )

plotly_plot
## Warning: `line.width` does not currently support multiple values.
## Warning: `line.width` does not currently support multiple values.
## Warning: `line.width` does not currently support multiple values.
## Warning: `line.width` does not currently support multiple values.
## Warning: `line.width` does not currently support multiple values.
## Warning: `line.width` does not currently support multiple values.
## Warning: `line.width` does not currently support multiple values.
## Warning: `line.width` does not currently support multiple values.
## Warning: `line.width` does not currently support multiple values.
## Warning: `line.width` does not currently support multiple values.
## Warning: `line.width` does not currently support multiple values.
## Warning: `line.width` does not currently support multiple values.
## Warning: `line.width` does not currently support multiple values.
## Warning: `line.width` does not currently support multiple values.
## Warning: `line.width` does not currently support multiple values.
## Warning: `line.width` does not currently support multiple values.
## Warning: `line.width` does not currently support multiple values.
## Warning: `line.width` does not currently support multiple values.
## Warning: `line.width` does not currently support multiple values.
## Warning: `line.width` does not currently support multiple values.
## Warning: `line.width` does not currently support multiple values.
## Warning: `line.width` does not currently support multiple values.
## Warning: `line.width` does not currently support multiple values.
## Warning: `line.width` does not currently support multiple values.
## Warning: `line.width` does not currently support multiple values.
## Warning: `line.width` does not currently support multiple values.
## Warning: `line.width` does not currently support multiple values.
## Warning: `line.width` does not currently support multiple values.
## Warning: `line.width` does not currently support multiple values.
## Warning: `line.width` does not currently support multiple values.
## Warning: `line.width` does not currently support multiple values.
## Warning: `line.width` does not currently support multiple values.
## Warning: `line.width` does not currently support multiple values.
## Warning: `line.width` does not currently support multiple values.
## Warning: `line.width` does not currently support multiple values.
## Warning: `line.width` does not currently support multiple values.
## Warning: `line.width` does not currently support multiple values.
## Warning: `line.width` does not currently support multiple values.
## Warning: `line.width` does not currently support multiple values.
## Warning: `line.width` does not currently support multiple values.
## Warning: `line.width` does not currently support multiple values.
## Warning: `line.width` does not currently support multiple values.
## Warning: `line.width` does not currently support multiple values.
## Warning: `line.width` does not currently support multiple values.
## Warning: `line.width` does not currently support multiple values.
## Warning: `line.width` does not currently support multiple values.
## Warning: `line.width` does not currently support multiple values.
## Warning: `line.width` does not currently support multiple values.
## Warning: `line.width` does not currently support multiple values.
## Warning: `line.width` does not currently support multiple values.
## Warning: `line.width` does not currently support multiple values.
## Warning: `line.width` does not currently support multiple values.
## Warning: `line.width` does not currently support multiple values.
## Warning: `line.width` does not currently support multiple values.
## Warning: `line.width` does not currently support multiple values.
## Warning: `line.width` does not currently support multiple values.
## Warning: `line.width` does not currently support multiple values.
## Warning: `line.width` does not currently support multiple values.
## Warning: `line.width` does not currently support multiple values.
## Warning: `line.width` does not currently support multiple values.

This interactive plot created with Plotly and using the Gapminder dataset visualizes how countries’ Life expectancy and GDP per Capita changed over time. It reveals how over time as countries’ prospered economically, the age of their populations increased as well. Life Expectancy is on the y-axis and GDP per Capita is on the x-axis, and the slider controls the time (year). It suggests that economic growth is often consistent with life expectancy, especially when this is appropriated to the necessary services being healthcare, sanitation, education, etc.