Mini Project 1

QUESTION 1

data(cars)
median(cars$speed)

## [1] 15

QUESTION 2

# Install once if needed: 
# install.packages("jsonlite")
# install.packages("dplyr") 

library(jsonlite)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

# 100 most recent days = limit 99 (endpoint returns limit + 1 rows)
url <- "https://min-api.cryptocompare.com/data/v2/histoday?fsym=BTC&tsym=USD&limit=99"

j <- fromJSON(url)

# Data lives in $Data$Data
btc <- j$Data$Data

# Max daily close price
max_close <- max(btc$close, na.rm = TRUE)
max_close

## [1] 124723

# Install needed packages (if not yet installed)
# install.packages(c("jsonlite", "dplyr", "ggplot2", "lubridate"))

library(jsonlite)
library(dplyr)
library(ggplot2)
library(lubridate)

## 
## Attaching package: 'lubridate'

## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union

# API endpoint for BTC-USD, last 100 days
url <- "https://min-api.cryptocompare.com/data/v2/histoday?fsym=BTC&tsym=USD&limit=99"

btc_data <- fromJSON(url)$Data$Data

# Convert UNIX time to readable date
btc_data <- btc_data %>%
  mutate(date = as_datetime(time)) %>%
  select(date, open, high, low, close, volumefrom, volumeto)

# Check structure
str(btc_data)

## 'data.frame':    100 obs. of  7 variables:
##  $ date      : POSIXct, format: "2025-08-10" "2025-08-11" ...
##  $ open      : num  116501 119312 118715 120128 123375 ...
##  $ high      : num  119321 122310 120326 123735 124533 ...
##  $ low       : num  116496 118107 118214 118948 117241 ...
##  $ close     : num  119312 118715 120128 123375 118392 ...
##  $ volumefrom: num  10554 25660 18214 27353 35492 ...
##  $ volumeto  : num  1.25e+09 3.09e+09 2.17e+09 3.32e+09 4.25e+09 ...

# Remove missing values
btc_data <- na.omit(btc_data)

# Basic summary
summary(btc_data)

##       date                          open             high       
##  Min.   :2025-08-10 00:00:00   Min.   : 94244   Min.   : 95973  
##  1st Qu.:2025-09-03 18:00:00   1st Qu.:109207   1st Qu.:110998  
##  Median :2025-09-28 12:00:00   Median :112158   Median :113659  
##  Mean   :2025-09-28 12:00:00   Mean   :112129   Mean   :113651  
##  3rd Qu.:2025-10-23 06:00:00   3rd Qu.:115810   3rd Qu.:116863  
##  Max.   :2025-11-17 00:00:00   Max.   :124723   Max.   :126287  
##       low             close          volumefrom       volumeto        
##  Min.   : 92149   Min.   : 92155   Min.   : 5423   Min.   :6.268e+08  
##  1st Qu.:107501   1st Qu.:109004   1st Qu.:16986   1st Qu.:1.896e+09  
##  Median :110469   Median :112066   Median :22676   Median :2.478e+09  
##  Mean   :110270   Mean   :111886   Mean   :23645   Mean   :2.628e+09  
##  3rd Qu.:114723   3rd Qu.:115715   3rd Qu.:28542   3rd Qu.:3.310e+09  
##  Max.   :123143   Max.   :124723   Max.   :64052   Max.   :7.412e+09

# Create new metrics
btc_data <- btc_data %>%
  mutate(daily_return = (close - lag(close)) / lag(close) * 100)

# Visualize
ggplot(btc_data, aes(x = date, y = close)) +
  geom_line() +
  labs(title = "Bitcoin Daily Closing Price (Last 100 Days)",
       x = "Date", y = "Close Price (USD)")

Sys.setenv(COLLEGE_SCORECARD_API_KEY = "")


Sys.getenv("COLLEGE_SCORECARD_API_KEY")

## [1] ""

# Research Questions (15 pts)
# RQ1: Which countries rank highest/lowest in beer, wine, and spirits servings per capita?
# RQ2: Which continents show the strongest preference for beer vs. wine vs. spirits?
# RQ3: How concentrated is a country’s drinking into one beverage vs. balanced across types?
# RQ4: How does total litres of pure alcohol relate to beverage mix?
# RQ5: Are top wine countries also high in total alcohol?

# ======================
# Setup
# ======================
req_pkgs <- c("tidyverse","janitor","countrycode","skimr","readr","glue")
to_install <- setdiff(req_pkgs, rownames(installed.packages()))
if (length(to_install)) install.packages(to_install, dependencies = TRUE)
invisible(lapply(req_pkgs, library, character.only = TRUE))

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0     ✔ stringr 1.5.2
## ✔ purrr   1.1.0     ✔ tibble  3.3.0
## ✔ readr   2.1.5     ✔ tidyr   1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter()  masks stats::filter()
## ✖ purrr::flatten() masks jsonlite::flatten()
## ✖ dplyr::lag()     masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
## 
## Attaching package: 'janitor'
## 
## 
## The following objects are masked from 'package:stats':
## 
##     chisq.test, fisher.test

set.seed(4315)

article_url <- "https://fivethirtyeight.com/features/dear-mona-followup-where-do-people-drink-the-most-beer-wine-and-spirits/"
data_url    <- "https://raw.githubusercontent.com/fivethirtyeight/data/master/alcohol-consumption/drinks.csv"

raw_drinks <- readr::read_csv(data_url, show_col_types = FALSE)

# Quick structure and summary
glimpse(raw_drinks)

## Rows: 193
## Columns: 5
## $ country                      <chr> "Afghanistan", "Albania", "Algeria", "And…
## $ beer_servings                <dbl> 0, 89, 25, 245, 217, 102, 193, 21, 261, 2…
## $ spirit_servings              <dbl> 0, 132, 0, 138, 57, 128, 25, 179, 72, 75,…
## $ wine_servings                <dbl> 0, 54, 14, 312, 45, 45, 221, 11, 212, 191…
## $ total_litres_of_pure_alcohol <dbl> 0.0, 4.9, 0.7, 12.4, 5.9, 4.9, 8.3, 3.8, …

skimr::skim(raw_drinks)

Data summary
Name	raw_drinks
Number of rows	193
Number of columns	5
_______________________
Column type frequency:
character	1
numeric	4
________________________
Group variables	None

Variable type: character

skim_variable	n_missing	complete_rate	min	max	empty	n_unique	whitespace
country	0	1	3	28	0	193	0

Variable type: numeric

skim_variable	complete_rate	mean	sd	p25	p50	p75	p100	hist
beer_servings	1	106.16	101.14	20.0	76.0	188.0	376.0	▇▃▂▂▁
spirit_servings	1	80.99	88.28	4.0	56.0	128.0	438.0	▇▃▂▁▁
wine_servings	1	49.45	79.70	1.0	8.0	59.0	370.0	▇▁▁▁▁
total_litres_of_pure_alcohol	1	4.72	3.77	1.3	4.2	7.2	14.4	▇▃▅▃▁

# 0) Column expectations from the FiveThirtyEight CSV
expected_cols <- c("country","beer_servings","spirit_servings",
                   "wine_servings","total_litres_of_pure_alcohol")
missing_now <- setdiff(expected_cols, names(raw_drinks))
if (length(missing_now)) {
  stop(glue("Missing expected columns in drinks.csv: {paste(missing_now, collapse=', ')}"))
}

# 1) Standardize names, trim country
drinks <- raw_drinks |>
  janitor::clean_names() |>
  mutate(country = stringr::str_squish(country))

# 2) Ensure/repair 'continent'
has_continent <- "continent" %in% names(drinks)
if (has_continent) {
  drinks <- drinks |>
    mutate(
      continent = dplyr::na_if(stringr::str_squish(continent), ""),
      continent_cc = countrycode(country, "country.name", "continent"),
      continent = coalesce(continent, continent_cc)
    ) |>
    select(-continent_cc)
} else {
  drinks <- drinks |>
    mutate(continent = countrycode(country, "country.name", "continent"))
}

## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `continent = countrycode(country, "country.name", "continent")`.
## Caused by warning:
## ! Some values were not matched unambiguously: Micronesia

# 3) Numeric hygiene: cast + guard against negatives
num_cols <- c("beer_servings","spirit_servings","wine_servings","total_litres_of_pure_alcohol")
drinks <- drinks |>
  mutate(across(all_of(num_cols), ~ if_else(as.numeric(.x) < 0, NA_real_, as.numeric(.x))))

# 4) Mix shares & balance index
drinks <- drinks |>
  mutate(
    total_servings = beer_servings + spirit_servings + wine_servings,
    beer_share     = if_else(total_servings > 0, beer_servings   / total_servings, NA_real_),
    wine_share     = if_else(total_servings > 0, wine_servings   / total_servings, NA_real_),
    spirits_share  = if_else(total_servings > 0, spirit_servings / total_servings, NA_real_),
    hhi            = beer_share^2 + wine_share^2 + spirits_share^2,
    # scale (1 - HHI) to [0,1], where 0 = single-beverage concentrated, 1 = evenly balanced
    mix_balance    = if_else(!is.na(hhi), (1 - hhi) / (1 - 1/3), NA_real_)
  )

# 5) Long format for beverage-specific work
drinks_long <- drinks |>
  pivot_longer(
    cols = c(beer_servings, wine_servings, spirit_servings),
    names_to = "beverage",
    values_to = "servings"
  ) |>
  mutate(beverage = recode(beverage,
                           beer_servings   = "Beer",
                           wine_servings   = "Wine",
                           spirit_servings = "Spirits"))

# 6) Country rankings by beverage
rankings <- drinks_long |>
  group_by(beverage) |>
  arrange(desc(servings), .by_group = TRUE) |>
  mutate(rank_within_beverage = row_number()) |>
  ungroup()

# 7) Overall alcohol ranking
total_rank <- drinks |>
  arrange(desc(total_litres_of_pure_alcohol)) |>
  mutate(total_alcohol_rank = row_number()) |>
  select(country, continent, total_litres_of_pure_alcohol, total_alcohol_rank)

rankings_with_total <- rankings |>
  left_join(total_rank, by = "country")

# 8) Continent profiles
continent_profile <- drinks_long |>
  group_by(continent, beverage) |>
  summarise(
    mean_servings   = mean(servings, na.rm = TRUE),
    median_servings = median(servings, na.rm = TRUE),
    .groups = "drop"
  )

continent_share_profile <- drinks |>
  group_by(continent) |>
  summarise(
    mean_beer_share    = mean(beer_share, na.rm = TRUE),
    mean_wine_share    = mean(wine_share, na.rm = TRUE),
    mean_spirits_share = mean(spirits_share, na.rm = TRUE),
    countries_n        = n(),
    .groups = "drop"
  )

# Top/Bottom 10 by beverage
top10_each <- rankings |>
  group_by(beverage) |>
  slice_head(n = 10) |>
  select(beverage, rank_within_beverage, country, continent, servings)

bottom10_each <- rankings |>
  group_by(beverage) |>
  slice_tail(n = 10) |>
  arrange(beverage, rank_within_beverage)

# Save clean outputs for future analysis/visualization
write_csv(drinks,                   "drinks_clean.csv")
write_csv(drinks_long,              "drinks_long.csv")
write_csv(rankings_with_total,      "drinks_rankings.csv")
write_csv(continent_profile,        "continent_profile.csv")
write_csv(continent_share_profile,  "continent_share_profile.csv")

# Show key tables in the knitted doc
top10_each

## # A tibble: 30 × 5
## # Groups:   beverage [3]
##    beverage rank_within_beverage country        continent servings
##    <chr>                   <int> <chr>          <chr>        <dbl>
##  1 Beer                        1 Namibia        Africa         376
##  2 Beer                        2 Czech Republic Europe         361
##  3 Beer                        3 Gabon          Africa         347
##  4 Beer                        4 Germany        Europe         346
##  5 Beer                        5 Lithuania      Europe         343
##  6 Beer                        6 Poland         Europe         343
##  7 Beer                        7 Venezuela      Americas       333
##  8 Beer                        8 Ireland        Europe         313
##  9 Beer                        9 Palau          Oceania        306
## 10 Beer                       10 Romania        Europe         297
## # ℹ 20 more rows

continent_share_profile

## # A tibble: 6 × 5
##   continent mean_beer_share mean_wine_share mean_spirits_share countries_n
##   <chr>               <dbl>           <dbl>              <dbl>       <int>
## 1 Africa              0.675          0.138               0.188          53
## 2 Americas            0.463          0.0948              0.442          35
## 3 Asia                0.407          0.0993              0.494          47
## 4 Europe              0.413          0.299               0.288          42
## 5 Oceania             0.519          0.135               0.346          15
## 6 <NA>                0.477          0.138               0.385           1

# Country mix balance: top 12 most balanced
drinks |>
  arrange(desc(mix_balance)) |>
  slice_head(n = 12) |>
  ggplot(aes(x = reorder(country, mix_balance), y = mix_balance)) +
  geom_col() +
  coord_flip() +
  labs(title = "Most Balanced Beverage Mix (Top 12)",
       x = NULL, y = "Balance Index (0=concentrated, 1=balanced)")

# Mean servings by continent & beverage
continent_profile |>
  ggplot(aes(x = continent, y = mean_servings, fill = beverage)) +
  geom_col(position = "dodge") +
  labs(title = "Mean Servings per Capita by Continent & Beverage",
       x = "Continent", y = "Mean Servings")

Cleaning steps: • Standardized variable names and trimmed country names • Repaired/created continent using countrycode (robust if the CSV lacks a ‘continent’ column) • Cast numeric columns and guarded against negative values • Computed beverage shares and a balance index (scaled 0–1) • Produced long format for beverage-specific analysis • Built country and continent summaries/rankings

Use: • ‘drinks_long’ for rank plots and beverage-specific visuals • ‘continent_profile’ & ‘continent_share_profile’ for regional comparisons • ‘rankings_with_total’ to relate total alcohol to beverage preference

# cat("
# Data Source (extraction): FiveThirtyEight “Dear Mona” article:
# ", article_url, "
# Raw data: ", data_url ")

Mini Project 1

Andrew Root

2025-10-16