Homework 3

setwd(“C:/Users/emily/OneDrive/바탕 화면/FELS/Use_Misuse of Data/Homework 3”)

Strings

#install.packages("gutenbergr")
library(gutenbergr)

## Warning: package 'gutenbergr' was built under R version 4.4.3

library(dplyr)

## Warning: package 'dplyr' was built under R version 4.4.2

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(stringr)
library(tidyverse)

## Warning: package 'tidyverse' was built under R version 4.4.3

## Warning: package 'ggplot2' was built under R version 4.4.2

## Warning: package 'tidyr' was built under R version 4.4.2

## Warning: package 'lubridate' was built under R version 4.4.2

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.0     ✔ readr     2.1.5
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2

## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

works<-gutenberg_works()
works_last <- works %>%
  mutate(lastname = str_extract(author, "^[^,]+")) %>% #Extract everything before the first comma
  arrange(lastname) %>% #Order alphabetically by last name
  select(author, lastname, everything()) #Bring author and lastname to the front
head(works_last, 50) #Take a look

## # A tibble: 50 × 9
##    author               lastname gutenberg_id title gutenberg_author_id language
##    <chr>                <chr>           <int> <chr>               <int> <fct>   
##  1 "\"Colored Quartet\… "\"Colo…        10269 "Pha…                3425 en      
##  2 "A British officer"  "A Brit…        59089 "The…               50297 en      
##  3 "A Californian"      "A Cali…        58890 "The…               50199 en      
##  4 "A lady"             "A lady"        58794 "The…               50154 en      
##  5 "A-No. 1"            "A-No. …        12424 "The…                4360 en      
##  6 "A. L. O. E."        "A. L. …        25516 "The…               31385 en      
##  7 "A. L. O. E."        "A. L. …        26094 "Heb…               31385 en      
##  8 "A. L. O. E."        "A. L. …        29863 "The…               31385 en      
##  9 "A. L. O. E."        "A. L. …        35533 "The…               31385 en      
## 10 "A. L. O. E."        "A. L. …        35705 "The…               31385 en      
## # ℹ 40 more rows
## # ℹ 3 more variables: gutenberg_bookshelf <chr>, rights <fct>, has_text <lgl>

stowe_works <- gutenberg_works() %>%
  filter(str_detect(author, "Stowe, Harriet Beecher"), has_text == TRUE) #Troubleshoot
stowe_ids <- stowe_works$gutenberg_id #Extract IDs for work
stowe <- gutenberg_download(stowe_ids, meta_fields = "title") #Download work with those IDs

## Determining mirror for Project Gutenberg from
## https://www.gutenberg.org/robot/harvest.
## Using mirror http://aleph.gutenberg.org.

## Warning: ! Could not download a book at http://aleph.gutenberg.org/1/0/2/7/10279/10279.
## ℹ The book may have been archived.
## ℹ Alternatively, You may need to select a different mirror.
## → See https://www.gutenberg.org/MIRRORS.ALL for options.

head(stowe_works, 10)

## # A tibble: 10 × 8
##    gutenberg_id title    author gutenberg_author_id language gutenberg_bookshelf
##           <int> <chr>    <chr>                <int> <fct>    <chr>              
##  1          203 "Uncle … Stowe…                 115 en       US Civil War/Slave…
##  2         2486 "Queer … Stowe…                 115 en       Browsing: Children…
##  3         6598 "Americ… Stowe…                 115 en       Technology/Browsin…
##  4         6702 "Life o… Stowe…                 115 en       Biographies/Browsi…
##  5         6931 "Sunny … Stowe…                 115 en       Browsing: History …
##  6        10279 "Uncle … Stowe…                 115 en       Browsing: Culture/…
##  7        10723 "Betty'… Stowe…                 115 en       Christmas/Browsing…
##  8        11171 "Uncle … Stowe…                 115 en       Children's Literat…
##  9        12354 "Pink a… Stowe…                 115 en       Browsing: Culture/…
## 10        13945 "Sunny … Stowe…                 115 en       Browsing: History …
## # ℹ 2 more variables: rights <fct>, has_text <lgl>

#A row contains the gutenberg id, title, author(which we have filtered to only Harriet B. Stowe), author id, language, bookshelf, rights, and verification that each has text(result of troubleshooting earlier).

water_count <- stowe %>% filter(str_detect(str_to_lower(text), "water")) %>% nrow()
revenge_count <- stowe %>% filter(str_detect(str_to_lower(text), "revenge")) %>% nrow()

water_count #1641

## [1] 1641

revenge_count #64

## [1] 64

#Subset to only lines containing water
stowe_water <- stowe %>%
  filter(str_detect(str_to_lower(text), "water"))
#Count by title
stowe_water_count <- stowe_water %>%
  count(title, sort = TRUE)
stowe_water_count

## # A tibble: 37 × 2
##    title                                                                       n
##    <chr>                                                                   <int>
##  1 "American Woman's Home: Or, Principles of Domestic Science;\r\nBeing a…   484
##  2 "Household Papers and Stories"                                             88
##  3 "Sunny Memories of Foreign Lands, Volume 2"                                78
##  4 "The Pearl of Orr's Island: A Story of the Coast of Maine"                 65
##  5 "House and Home Papers\nSeventh Edition"                                   60
##  6 "Queer little people"                                                      60
##  7 "Uncle Tom's Cabin"                                                        60
##  8 "Dred: A Tale of the Great Dismal Swamp"                                   58
##  9 "Palmetto-Leaves"                                                          54
## 10 "Agnes of Sorrento"                                                        53
## # ℹ 27 more rows

#Most represented: American Woman's Home

#Count how many lines per work
stowe_length <- stowe %>%
  count(title, sort = TRUE)
# View longest and shortest
head(stowe_length, 1)   # Longest is American Woman's Home

## # A tibble: 1 × 2
##   title                                                                        n
##   <chr>                                                                    <int>
## 1 "American Woman's Home: Or, Principles of Domestic Science;\r\nBeing a … 31704

tail(stowe_length, 1)   # Shortest is Sunny Memories of Foreign Lands, V1

## # A tibble: 1 × 2
##   title                                         n
##   <chr>                                     <int>
## 1 Sunny Memories Of Foreign Lands, Volume 1     1

#Create a new variable that marks lines with commas
stowe <- stowe %>%
  mutate(has_comma = str_detect(text, ","))
# View a sample of the new variable
head(stowe, 10)

## # A tibble: 10 × 4
##    gutenberg_id text                       title             has_comma
##           <int> <chr>                      <chr>             <lgl>    
##  1          203 "Uncle Tom’s Cabin"        Uncle Tom's Cabin FALSE    
##  2          203 " or "                     Uncle Tom's Cabin FALSE    
##  3          203 " Life among the Lowly"    Uncle Tom's Cabin FALSE    
##  4          203 ""                         Uncle Tom's Cabin FALSE    
##  5          203 "by Harriet Beecher Stowe" Uncle Tom's Cabin FALSE    
##  6          203 ""                         Uncle Tom's Cabin FALSE    
##  7          203 ""                         Uncle Tom's Cabin FALSE    
##  8          203 "Contents"                 Uncle Tom's Cabin FALSE    
##  9          203 ""                         Uncle Tom's Cabin FALSE    
## 10          203 " VOLUME I"                Uncle Tom's Cabin FALSE

# Calculate the percentage of lines that contain commas
comma_percentage <- stowe %>%
  summarise(percent_with_comma = mean(has_comma, na.rm = TRUE) * 100)

comma_percentage #About 51%

## # A tibble: 1 × 1
##   percent_with_comma
##                <dbl>
## 1               51.1

Dates

library(ggplot2)
library(lubridate)
stocks <- readRDS("C:/Users/emily/Downloads/stockpricedata.rds")
head(stocks)

## # A tibble: 6 × 8
##   symbol date        open  high   low close    volume adjusted
##   <chr>  <date>     <dbl> <dbl> <dbl> <dbl>     <dbl>    <dbl>
## 1 AAPL   2020-01-02  74.1  75.2  73.8  75.1 135480400     72.9
## 2 AAPL   2020-01-03  74.3  75.1  74.1  74.4 146322800     72.2
## 3 AAPL   2020-01-06  73.4  75.0  73.2  74.9 118387200     72.7
## 4 AAPL   2020-01-07  75.0  75.2  74.4  74.6 108872000     72.4
## 5 AAPL   2020-01-08  74.3  76.1  74.3  75.8 132079200     73.6
## 6 AAPL   2020-01-09  76.8  77.6  76.6  77.4 170108400     75.1

#Plot Apple and Microsoft stock prices over time
ggplot(stocks, aes(x = date, y = close, color = symbol)) +
  geom_line(size = 1) +
  labs(title = "Apple (AAPL) and Microsoft (MSFT) Stock Prices, Jan 2020–Oct 2021",
       subtitle = "Daily Closing Prices",
       x = "Date",
       y = "Closing Price (USD)",
       color = "Company") +
  theme_minimal(base_size = 12)

## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

#Both stocks appear to show parallel trends. Both show a dip around March 2020 (around early COVID-19 market crash).
#After that, both companies recover and show steady growth through 2021.
#These trends align with increased investor confidence in tech companies during remote work expansion.

#Use floor_date() to round each date down to the start of its week
#Then, group by stock symbol and week to compute the average closing price
stocks_weekly <- stocks %>%
  mutate(week = floor_date(date, "week")) %>% # Create 'week' variable
  group_by(symbol, week) %>% # Group by company and week
  summarize(avg_close = mean(close, na.rm = TRUE)) # Average closing price per week

## `summarise()` has grouped output by 'symbol'. You can override using the
## `.groups` argument.

#Plot weekly average prices
ggplot(stocks_weekly, aes(x = week, y = avg_close, color = symbol)) +
  geom_line(size = 1) +
  labs(title = "Weekly Average Stock Prices: Apple vs Microsoft",
       subtitle = "January 2020 – October 2021",
       x = "Week",
       y = "Average Closing Price (USD)",
       color = "Company") +
  theme_minimal(base_size = 12)

#Aggregating to weekly averages smooths out the stock data; still same projection.
#We can see both companies steadily rising across 2020–2021.
#Apple shows slightly more volatility, especially around product release seasons.

##Combo of strings and dates

library(nycflights13)

## Warning: package 'nycflights13' was built under R version 4.4.3

flights <- flights

flights <- flights %>%
  mutate(date = paste(month, day, year, sep = "/")) #Create "M/D/Y" string
#Confirm
head(flights$date)

## [1] "1/1/2013" "1/1/2013" "1/1/2013" "1/1/2013" "1/1/2013" "1/1/2013"

tail(flights$date) #Yup

## [1] "9/30/2013" "9/30/2013" "9/30/2013" "9/30/2013" "9/30/2013" "9/30/2013"

flights <- flights %>%
  mutate(formatdate = as.Date(date, format = "%m/%d/%Y"))
#Confirm structure change
str(flights$formatdate)

##  Date[1:336776], format: "2013-01-01" "2013-01-01" "2013-01-01" "2013-01-01" "2013-01-01" ...

summary(flights$formatdate)

##         Min.      1st Qu.       Median         Mean      3rd Qu.         Max. 
## "2013-01-01" "2013-04-04" "2013-07-03" "2013-07-02" "2013-10-01" "2013-12-31"

daily_delays <- flights %>%
  group_by(origin, formatdate) %>%
  summarize(avg_dep_delay = mean(dep_delay, na.rm = TRUE)) %>%
  ungroup()

## `summarise()` has grouped output by 'origin'. You can override using the
## `.groups` argument.

head(daily_delays)

## # A tibble: 6 × 3
##   origin formatdate avg_dep_delay
##   <chr>  <date>             <dbl>
## 1 EWR    2013-01-01         17.5 
## 2 EWR    2013-01-02         25.3 
## 3 EWR    2013-01-03          8.45
## 4 EWR    2013-01-04         12.1 
## 5 EWR    2013-01-05          5.70
## 6 EWR    2013-01-06         12.4

ggplot(daily_delays, aes(x = formatdate, y = avg_dep_delay, color = origin)) +
  geom_line() +
  labs(title = "Average Daily Departure Delays by Airport",
       x = "Date",
       y = "Average Departure Delay",
       color = "Airport")

#Attempting to make it less...messy

#Groups the dates into weeks
flights_weekly <- flights %>%
  mutate(week = lubridate::floor_date(formatdate, "week")) #round each date down to the start of its week

#Calculate average weekly departure delay for each airport
weekly_delays <- flights_weekly %>%
  group_by(origin, week) %>%
  summarize(avg_dep_delay = mean(dep_delay, na.rm = TRUE)) %>%
  ungroup()

## `summarise()` has grouped output by 'origin'. You can override using the
## `.groups` argument.

head(weekly_delays)

## # A tibble: 6 × 3
##   origin week       avg_dep_delay
##   <chr>  <date>             <dbl>
## 1 EWR    2012-12-30         14.3 
## 2 EWR    2013-01-06          6.03
## 3 EWR    2013-01-13         14.3 
## 4 EWR    2013-01-20         20.0 
## 5 EWR    2013-01-27         19.8 
## 6 EWR    2013-02-03         10.8

#Plot weekly average departure delays
ggplot(weekly_delays, aes(x = week, y = avg_dep_delay, color = origin)) +
  geom_line() +
  labs(title = "Weekly Average Departure Delays by Airport",
       x = "Week",
       y = "Average Departure Delay",
       color = "Airport")

#Somewhat better?

#Thank You

Homework 3

Emily

Strings

Dates