Midterm

library(robotstxt)

# Ask the site's robots.txt whether this page may be fetched by a bot
paths_allowed(
  paths = "https://en.wikipedia.org/wiki/List_of_current_United_States_governors"
)

##  en.wikipedia.org

## [1] TRUE

library(rvest)
url  <- "https://en.wikipedia.org/wiki/List_of_current_United_States_governors"
page <- read_html(url)

# Select every wikitable on the page and parse each one into a tibble.
# `fill` is deprecated since rvest 1.0.0 (short rows are always padded with
# NA), so it is no longer passed.
tables <- html_elements(page, "table.wikitable")
dfs    <- html_table(tables)

# Pick the table by its caption.
# html_element() (singular) yields exactly one result per table -- NA when a
# table has no <caption> -- so position i in `caps` always refers to dfs[[i]].
# html_elements() would silently skip caption-less tables and shift the index.
caps <- html_text2(html_element(tables, "caption"))
ix   <- which(grepl("^Current state governors", caps, ignore.case = TRUE))

# Fail loudly if the page layout changed instead of indexing with a
# zero-length or multi-element subscript.
if (length(ix) != 1L) {
  stop(
    "Expected exactly one table captioned 'Current state governors', found ",
    length(ix), ".",
    call. = FALSE
  )
}

governors <- dfs[[ix]]

# Quick clean: tidy header whitespace and drop rows whose first column
# (the state) is empty.
names(governors) <- trimws(names(governors))
governors <- subset(governors, governors[[1]] != "")
# Confirm the class of the parsed table, then preview its first rows.
class(governors)

## [1] "tbl_df"     "tbl"        "data.frame"

head(governors)

## # A tibble: 6 × 10
##   State             Image `Governor[14]`         `Party[14]` `Party[14]` Born   
##   <chr>             <lgl> <chr>                  <lgl>       <chr>       <chr>  
## 1 Alabama (list)    NA    Kay Ivey               NA          Republican  (1944-…
## 2 Alaska (list)     NA    Mike Dunleavy          NA          Republican  (1961-…
## 3 Arizona (list)    NA    Katie Hobbs            NA          Democratic  (1969-…
## 4 Arkansas (list)   NA    Sarah Huckabee Sanders NA          Republican  (1982-…
## 5 California (list) NA    Gavin Newsom           NA          Democratic  (1967-…
## 6 Colorado (list)   NA    Jared Polis            NA          Democratic  (1975-…
## # ℹ 4 more variables: `Prior public experience[15]` <chr>,
## #   `Inauguration[14]` <chr>, `End of term[14]` <chr>, Ref. <chr>

# List the columns that contain nothing but NA.
# vapply() guarantees a logical vector even for a zero-column table, whereas
# sapply() would return an empty list there.
names(governors)[vapply(governors, function(x) all(is.na(x)), logical(1))]

## [1] "Image"     "Party[14]"

# Step 1: remove columns that are entirely NA.
# The mask is computed per column (no matrix coercion of the whole table) and
# drop = FALSE keeps the result a table even if only one column survives.
keep_col  <- !vapply(governors, function(x) all(is.na(x)), logical(1))
governors <- governors[, keep_col, drop = FALSE]

# Step 2: strip footnote markers such as "[14]" from the column names.
# "[^]]*" stops at the first closing bracket, so a name carrying two markers
# does not lose the text between them (the greedy ".*" would remove it).
names(governors) <- trimws(gsub("\\[[^]]*\\]", "", names(governors)))

# Step 3: show all column names for a double-check
names(governors)

## [1] "State"                   "Governor"               
## [3] "Party"                   "Born"                   
## [5] "Prior public experience" "Inauguration"           
## [7] "End of term"             "Ref."

# Packages
library(jsonlite)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

# Build the URL per docs: Daily Pair OHLCV, last 150 days, XMR vs USD
url <- "https://min-api.cryptocompare.com/data/v2/histoday?fsym=XMR&tsym=USD&limit=150"

# Parse JSON -> list -> the data sublist -> data frame
xmr <- fromJSON(url)$Data$Data

# Stop early when the response carries no usable rows; otherwise min() would
# return Inf with only a warning and the timestamp lookup would come back empty.
if (!is.data.frame(xmr) || nrow(xmr) == 0L || all(is.na(xmr$open))) {
  stop(
    "The price API response contained no daily 'open' values; ",
    "the request may have failed.",
    call. = FALSE
  )
}

# Compute the minimum daily open and when it occurred.
# which.min() ignores NA and returns the first minimum, so ties resolve to the
# earliest row without comparing floating-point values with `==`.
min_row  <- which.min(xmr$open)
min_open <- xmr$open[min_row]
when_min <- as.POSIXct(xmr$time[min_row], origin = "1970-01-01", tz = "UTC")

# Lowest daily opening price found in the downloaded data.
min_open

## [1] 235.68

# Timestamp (UTC) of the day on which that lowest open occurred.
when_min

## [1] "2025-08-16 UTC"

# Same instant as above, shown in New York local time
with(
  xmr,
  as.POSIXct(time[which.min(open)], origin = "1970-01-01", tz = "America/New_York")
)

## [1] "2025-08-15 20:00:00 EDT"

# Load the built-in airquality dataset and print per-column summaries
data("airquality")

summary(object = airquality)

##      Ozone           Solar.R           Wind             Temp      
##  Min.   :  1.00   Min.   :  7.0   Min.   : 1.700   Min.   :56.00  
##  1st Qu.: 18.00   1st Qu.:115.8   1st Qu.: 7.400   1st Qu.:72.00  
##  Median : 31.50   Median :205.0   Median : 9.700   Median :79.00  
##  Mean   : 42.13   Mean   :185.9   Mean   : 9.958   Mean   :77.88  
##  3rd Qu.: 63.25   3rd Qu.:258.8   3rd Qu.:11.500   3rd Qu.:85.00  
##  Max.   :168.00   Max.   :334.0   Max.   :20.700   Max.   :97.00  
##  NA's   :37       NA's   :7                                       
##      Month            Day      
##  Min.   :5.000   Min.   : 1.0  
##  1st Qu.:6.000   1st Qu.: 8.0  
##  Median :7.000   Median :16.0  
##  Mean   :6.993   Mean   :15.8  
##  3rd Qu.:8.000   3rd Qu.:23.0  
##  Max.   :9.000   Max.   :31.0  
##

data("airquality")

# Days that were both calm (Wind below 8) and hot (Temp above 90).
# which() discards NA comparisons, matching what subset() does.
is_hot_calm <- with(airquality, Wind < 8 & Temp > 90)
hot_days    <- airquality[which(is_hot_calm), ]
nrow(hot_days)

## [1] 9

# Make sure the built-in dataset is available
data("airquality")

# Scatter plot of temperature against month, drawn as solid blue points
with(
  airquality,
  plot(
    Month, Temp,
    xlab = "Month",
    ylab = "Temperature",
    main = "Temperature by Month",
    col  = "blue",
    pch  = 19
  )
)

library(nycflights13)
library(dplyr)

# Total departure delay per origin airport, largest first.
# count() with `wt` sums the weight column (ignoring NA) within each origin.
flights %>%
  count(origin, wt = dep_delay, name = "total_delay", sort = TRUE)

## # A tibble: 3 × 2
##   origin total_delay
##   <chr>        <dbl>
## 1 EWR        1776635
## 2 JFK        1325264
## 3 LGA        1050301

library(nycflights13)
library(dplyr)

# Number of flights leaving each origin airport, busiest first
count(flights, origin, sort = TRUE)

## # A tibble: 3 × 2
##   origin      n
##   <chr>   <int>
## 1 EWR    120835
## 2 JFK    111279
## 3 LGA    104662

Midterm

Avery Quinn

2025-10-09