QUESTION 1
data(cars)
median(cars$speed)
## [1] 15
QUESTION 2
# Install once if needed:
# install.packages("jsonlite")
# install.packages("dplyr")
library(jsonlite)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# 100 most recent days = limit 99 (endpoint returns limit + 1 rows)
url <- "https://min-api.cryptocompare.com/data/v2/histoday?fsym=BTC&tsym=USD&limit=99"
j <- fromJSON(url)
# Data lives in $Data$Data
btc <- j$Data$Data
# Max daily close price
max_close <- max(btc$close, na.rm = TRUE)
max_close
## [1] 124723
# Install needed packages (if not yet installed)
# install.packages(c("jsonlite", "dplyr", "ggplot2", "lubridate"))
library(jsonlite)
library(dplyr)
library(ggplot2)
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
# API endpoint for BTC-USD, last 100 days
url <- "https://min-api.cryptocompare.com/data/v2/histoday?fsym=BTC&tsym=USD&limit=99"
btc_data <- fromJSON(url)$Data$Data
# Convert UNIX time to readable date
btc_data <- btc_data %>%
mutate(date = as_datetime(time)) %>%
select(date, open, high, low, close, volumefrom, volumeto)
# Check structure
str(btc_data)
## 'data.frame': 100 obs. of 7 variables:
## $ date : POSIXct, format: "2025-08-10" "2025-08-11" ...
## $ open : num 116501 119312 118715 120128 123375 ...
## $ high : num 119321 122310 120326 123735 124533 ...
## $ low : num 116496 118107 118214 118948 117241 ...
## $ close : num 119312 118715 120128 123375 118392 ...
## $ volumefrom: num 10554 25660 18214 27353 35492 ...
## $ volumeto : num 1.25e+09 3.09e+09 2.17e+09 3.32e+09 4.25e+09 ...
# Remove missing values
btc_data <- na.omit(btc_data)
# Basic summary
summary(btc_data)
## date open high
## Min. :2025-08-10 00:00:00 Min. : 94244 Min. : 95973
## 1st Qu.:2025-09-03 18:00:00 1st Qu.:109207 1st Qu.:110998
## Median :2025-09-28 12:00:00 Median :112158 Median :113659
## Mean :2025-09-28 12:00:00 Mean :112129 Mean :113651
## 3rd Qu.:2025-10-23 06:00:00 3rd Qu.:115810 3rd Qu.:116863
## Max. :2025-11-17 00:00:00 Max. :124723 Max. :126287
## low close volumefrom volumeto
## Min. : 92149 Min. : 92155 Min. : 5423 Min. :6.268e+08
## 1st Qu.:107501 1st Qu.:109004 1st Qu.:16986 1st Qu.:1.896e+09
## Median :110469 Median :112066 Median :22676 Median :2.478e+09
## Mean :110270 Mean :111886 Mean :23645 Mean :2.628e+09
## 3rd Qu.:114723 3rd Qu.:115715 3rd Qu.:28542 3rd Qu.:3.310e+09
## Max. :123143 Max. :124723 Max. :64052 Max. :7.412e+09
# Create new metrics
btc_data <- btc_data %>%
mutate(daily_return = (close - lag(close)) / lag(close) * 100)
# Visualize
ggplot(btc_data, aes(x = date, y = close)) +
geom_line() +
labs(title = "Bitcoin Daily Closing Price (Last 100 Days)",
x = "Date", y = "Close Price (USD)")
Sys.setenv(COLLEGE_SCORECARD_API_KEY = "")
Sys.getenv("COLLEGE_SCORECARD_API_KEY")
## [1] ""
# Research Questions (15 pts)
# RQ1: Which countries rank highest/lowest in beer, wine, and spirits servings per capita?
# RQ2: Which continents show the strongest preference for beer vs. wine vs. spirits?
# RQ3: How concentrated is a country’s drinking into one beverage vs. balanced across types?
# RQ4: How does total litres of pure alcohol relate to beverage mix?
# RQ5: Are top wine countries also high in total alcohol?
# ======================
# Setup
# ======================
req_pkgs <- c("tidyverse","janitor","countrycode","skimr","readr","glue")
to_install <- setdiff(req_pkgs, rownames(installed.packages()))
if (length(to_install)) install.packages(to_install, dependencies = TRUE)
invisible(lapply(req_pkgs, library, character.only = TRUE))
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ stringr 1.5.2
## ✔ purrr 1.1.0 ✔ tibble 3.3.0
## ✔ readr 2.1.5 ✔ tidyr 1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ purrr::flatten() masks jsonlite::flatten()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
##
## Attaching package: 'janitor'
##
##
## The following objects are masked from 'package:stats':
##
## chisq.test, fisher.test
set.seed(4315)
article_url <- "https://fivethirtyeight.com/features/dear-mona-followup-where-do-people-drink-the-most-beer-wine-and-spirits/"
data_url <- "https://raw.githubusercontent.com/fivethirtyeight/data/master/alcohol-consumption/drinks.csv"
raw_drinks <- readr::read_csv(data_url, show_col_types = FALSE)
# Quick structure and summary
glimpse(raw_drinks)
## Rows: 193
## Columns: 5
## $ country <chr> "Afghanistan", "Albania", "Algeria", "And…
## $ beer_servings <dbl> 0, 89, 25, 245, 217, 102, 193, 21, 261, 2…
## $ spirit_servings <dbl> 0, 132, 0, 138, 57, 128, 25, 179, 72, 75,…
## $ wine_servings <dbl> 0, 54, 14, 312, 45, 45, 221, 11, 212, 191…
## $ total_litres_of_pure_alcohol <dbl> 0.0, 4.9, 0.7, 12.4, 5.9, 4.9, 8.3, 3.8, …
skimr::skim(raw_drinks)
| Name | raw_drinks |
| Number of rows | 193 |
| Number of columns | 5 |
| _______________________ | |
| Column type frequency: | |
| character | 1 |
| numeric | 4 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| country | 0 | 1 | 3 | 28 | 0 | 193 | 0 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| beer_servings | 0 | 1 | 106.16 | 101.14 | 0 | 20.0 | 76.0 | 188.0 | 376.0 | ▇▃▂▂▁ |
| spirit_servings | 0 | 1 | 80.99 | 88.28 | 0 | 4.0 | 56.0 | 128.0 | 438.0 | ▇▃▂▁▁ |
| wine_servings | 0 | 1 | 49.45 | 79.70 | 0 | 1.0 | 8.0 | 59.0 | 370.0 | ▇▁▁▁▁ |
| total_litres_of_pure_alcohol | 0 | 1 | 4.72 | 3.77 | 0 | 1.3 | 4.2 | 7.2 | 14.4 | ▇▃▅▃▁ |
# 0) Column expectations from the FiveThirtyEight CSV
expected_cols <- c("country","beer_servings","spirit_servings",
"wine_servings","total_litres_of_pure_alcohol")
missing_now <- setdiff(expected_cols, names(raw_drinks))
if (length(missing_now)) {
stop(glue("Missing expected columns in drinks.csv: {paste(missing_now, collapse=', ')}"))
}
# 1) Standardize names, trim country
drinks <- raw_drinks |>
janitor::clean_names() |>
mutate(country = stringr::str_squish(country))
# 2) Ensure/repair 'continent'
has_continent <- "continent" %in% names(drinks)
if (has_continent) {
drinks <- drinks |>
mutate(
continent = dplyr::na_if(stringr::str_squish(continent), ""),
continent_cc = countrycode(country, "country.name", "continent"),
continent = coalesce(continent, continent_cc)
) |>
select(-continent_cc)
} else {
drinks <- drinks |>
mutate(continent = countrycode(country, "country.name", "continent"))
}
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `continent = countrycode(country, "country.name", "continent")`.
## Caused by warning:
## ! Some values were not matched unambiguously: Micronesia
# 3) Numeric hygiene: cast + guard against negatives
num_cols <- c("beer_servings","spirit_servings","wine_servings","total_litres_of_pure_alcohol")
drinks <- drinks |>
mutate(across(all_of(num_cols), ~ if_else(as.numeric(.x) < 0, NA_real_, as.numeric(.x))))
# 4) Mix shares & balance index
drinks <- drinks |>
mutate(
total_servings = beer_servings + spirit_servings + wine_servings,
beer_share = if_else(total_servings > 0, beer_servings / total_servings, NA_real_),
wine_share = if_else(total_servings > 0, wine_servings / total_servings, NA_real_),
spirits_share = if_else(total_servings > 0, spirit_servings / total_servings, NA_real_),
hhi = beer_share^2 + wine_share^2 + spirits_share^2,
# scale (1 - HHI) to [0,1], where 0 = single-beverage concentrated, 1 = evenly balanced
mix_balance = if_else(!is.na(hhi), (1 - hhi) / (1 - 1/3), NA_real_)
)
# 5) Long format for beverage-specific work
drinks_long <- drinks |>
pivot_longer(
cols = c(beer_servings, wine_servings, spirit_servings),
names_to = "beverage",
values_to = "servings"
) |>
mutate(beverage = recode(beverage,
beer_servings = "Beer",
wine_servings = "Wine",
spirit_servings = "Spirits"))
# 6) Country rankings by beverage
rankings <- drinks_long |>
group_by(beverage) |>
arrange(desc(servings), .by_group = TRUE) |>
mutate(rank_within_beverage = row_number()) |>
ungroup()
# 7) Overall alcohol ranking
total_rank <- drinks |>
arrange(desc(total_litres_of_pure_alcohol)) |>
mutate(total_alcohol_rank = row_number()) |>
select(country, continent, total_litres_of_pure_alcohol, total_alcohol_rank)
rankings_with_total <- rankings |>
left_join(total_rank, by = "country")
# 8) Continent profiles
continent_profile <- drinks_long |>
group_by(continent, beverage) |>
summarise(
mean_servings = mean(servings, na.rm = TRUE),
median_servings = median(servings, na.rm = TRUE),
.groups = "drop"
)
continent_share_profile <- drinks |>
group_by(continent) |>
summarise(
mean_beer_share = mean(beer_share, na.rm = TRUE),
mean_wine_share = mean(wine_share, na.rm = TRUE),
mean_spirits_share = mean(spirits_share, na.rm = TRUE),
countries_n = n(),
.groups = "drop"
)
# Top/Bottom 10 by beverage
top10_each <- rankings |>
group_by(beverage) |>
slice_head(n = 10) |>
select(beverage, rank_within_beverage, country, continent, servings)
bottom10_each <- rankings |>
group_by(beverage) |>
slice_tail(n = 10) |>
arrange(beverage, rank_within_beverage)
# Save clean outputs for future analysis/visualization
write_csv(drinks, "drinks_clean.csv")
write_csv(drinks_long, "drinks_long.csv")
write_csv(rankings_with_total, "drinks_rankings.csv")
write_csv(continent_profile, "continent_profile.csv")
write_csv(continent_share_profile, "continent_share_profile.csv")
# Show key tables in the knitted doc
top10_each
## # A tibble: 30 × 5
## # Groups: beverage [3]
## beverage rank_within_beverage country continent servings
## <chr> <int> <chr> <chr> <dbl>
## 1 Beer 1 Namibia Africa 376
## 2 Beer 2 Czech Republic Europe 361
## 3 Beer 3 Gabon Africa 347
## 4 Beer 4 Germany Europe 346
## 5 Beer 5 Lithuania Europe 343
## 6 Beer 6 Poland Europe 343
## 7 Beer 7 Venezuela Americas 333
## 8 Beer 8 Ireland Europe 313
## 9 Beer 9 Palau Oceania 306
## 10 Beer 10 Romania Europe 297
## # ℹ 20 more rows
continent_share_profile
## # A tibble: 6 × 5
## continent mean_beer_share mean_wine_share mean_spirits_share countries_n
## <chr> <dbl> <dbl> <dbl> <int>
## 1 Africa 0.675 0.138 0.188 53
## 2 Americas 0.463 0.0948 0.442 35
## 3 Asia 0.407 0.0993 0.494 47
## 4 Europe 0.413 0.299 0.288 42
## 5 Oceania 0.519 0.135 0.346 15
## 6 <NA> 0.477 0.138 0.385 1
# Country mix balance: top 12 most balanced
drinks |>
arrange(desc(mix_balance)) |>
slice_head(n = 12) |>
ggplot(aes(x = reorder(country, mix_balance), y = mix_balance)) +
geom_col() +
coord_flip() +
labs(title = "Most Balanced Beverage Mix (Top 12)",
x = NULL, y = "Balance Index (0=concentrated, 1=balanced)")
# Mean servings by continent & beverage
continent_profile |>
ggplot(aes(x = continent, y = mean_servings, fill = beverage)) +
geom_col(position = "dodge") +
labs(title = "Mean Servings per Capita by Continent & Beverage",
x = "Continent", y = "Mean Servings")
Cleaning steps: • Standardized variable names and trimmed country names • Repaired/created continent using countrycode (robust if the CSV lacks a ‘continent’ column) • Cast numeric columns and guarded against negative values • Computed beverage shares and a balance index (scaled 0–1) • Produced long format for beverage-specific analysis • Built country and continent summaries/rankings
Use: • ‘drinks_long’ for rank plots and beverage-specific visuals • ‘continent_profile’ & ‘continent_share_profile’ for regional comparisons • ‘rankings_with_total’ to relate total alcohol to beverage preference
# cat("
# Data Source (extraction): FiveThirtyEight “Dear Mona” article:
# ", article_url, "
# Raw data: ", data_url ")