setwd(“C:/Users/emily/OneDrive/바탕 화면/FELS/Use_Misuse of Data/Homework 3”)
#install.packages("gutenbergr")
library(gutenbergr)
## Warning: package 'gutenbergr' was built under R version 4.4.3
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.4.2
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(stringr)
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.4.3
## Warning: package 'ggplot2' was built under R version 4.4.2
## Warning: package 'tidyr' was built under R version 4.4.2
## Warning: package 'lubridate' was built under R version 4.4.2
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ readr 2.1.5
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
works<-gutenberg_works()
works_last <- works %>%
mutate(lastname = str_extract(author, "^[^,]+")) %>% #Extract everything before the first comma
arrange(lastname) %>% #Order alphabetically by last name
select(author, lastname, everything()) #Bring author and lastname to the front
head(works_last, 50) #Take a look
## # A tibble: 50 × 9
## author lastname gutenberg_id title gutenberg_author_id language
## <chr> <chr> <int> <chr> <int> <fct>
## 1 "\"Colored Quartet\… "\"Colo… 10269 "Pha… 3425 en
## 2 "A British officer" "A Brit… 59089 "The… 50297 en
## 3 "A Californian" "A Cali… 58890 "The… 50199 en
## 4 "A lady" "A lady" 58794 "The… 50154 en
## 5 "A-No. 1" "A-No. … 12424 "The… 4360 en
## 6 "A. L. O. E." "A. L. … 25516 "The… 31385 en
## 7 "A. L. O. E." "A. L. … 26094 "Heb… 31385 en
## 8 "A. L. O. E." "A. L. … 29863 "The… 31385 en
## 9 "A. L. O. E." "A. L. … 35533 "The… 31385 en
## 10 "A. L. O. E." "A. L. … 35705 "The… 31385 en
## # ℹ 40 more rows
## # ℹ 3 more variables: gutenberg_bookshelf <chr>, rights <fct>, has_text <lgl>
stowe_works <- gutenberg_works() %>%
filter(str_detect(author, "Stowe, Harriet Beecher"), has_text == TRUE) #Troubleshoot
stowe_ids <- stowe_works$gutenberg_id #Extract IDs for work
stowe <- gutenberg_download(stowe_ids, meta_fields = "title") #Download work with those IDs
## Determining mirror for Project Gutenberg from
## https://www.gutenberg.org/robot/harvest.
## Using mirror http://aleph.gutenberg.org.
## Warning: ! Could not download a book at http://aleph.gutenberg.org/1/0/2/7/10279/10279.
## ℹ The book may have been archived.
## ℹ Alternatively, You may need to select a different mirror.
## → See https://www.gutenberg.org/MIRRORS.ALL for options.
head(stowe_works, 10)
## # A tibble: 10 × 8
## gutenberg_id title author gutenberg_author_id language gutenberg_bookshelf
## <int> <chr> <chr> <int> <fct> <chr>
## 1 203 "Uncle … Stowe… 115 en US Civil War/Slave…
## 2 2486 "Queer … Stowe… 115 en Browsing: Children…
## 3 6598 "Americ… Stowe… 115 en Technology/Browsin…
## 4 6702 "Life o… Stowe… 115 en Biographies/Browsi…
## 5 6931 "Sunny … Stowe… 115 en Browsing: History …
## 6 10279 "Uncle … Stowe… 115 en Browsing: Culture/…
## 7 10723 "Betty'… Stowe… 115 en Christmas/Browsing…
## 8 11171 "Uncle … Stowe… 115 en Children's Literat…
## 9 12354 "Pink a… Stowe… 115 en Browsing: Culture/…
## 10 13945 "Sunny … Stowe… 115 en Browsing: History …
## # ℹ 2 more variables: rights <fct>, has_text <lgl>
#A row contains the gutenberg id, title, author(which we have filtered to only Harriet B. Stowe), author id, language, bookshelf, rights, and verification that each has text(result of troubleshooting earlier).
water_count <- stowe %>% filter(str_detect(str_to_lower(text), "water")) %>% nrow()
revenge_count <- stowe %>% filter(str_detect(str_to_lower(text), "revenge")) %>% nrow()
water_count #1641
## [1] 1641
revenge_count #64
## [1] 64
#Subset to only lines containing water
stowe_water <- stowe %>%
filter(str_detect(str_to_lower(text), "water"))
#Count by title
stowe_water_count <- stowe_water %>%
count(title, sort = TRUE)
stowe_water_count
## # A tibble: 37 × 2
## title n
## <chr> <int>
## 1 "American Woman's Home: Or, Principles of Domestic Science;\r\nBeing a… 484
## 2 "Household Papers and Stories" 88
## 3 "Sunny Memories of Foreign Lands, Volume 2" 78
## 4 "The Pearl of Orr's Island: A Story of the Coast of Maine" 65
## 5 "House and Home Papers\nSeventh Edition" 60
## 6 "Queer little people" 60
## 7 "Uncle Tom's Cabin" 60
## 8 "Dred: A Tale of the Great Dismal Swamp" 58
## 9 "Palmetto-Leaves" 54
## 10 "Agnes of Sorrento" 53
## # ℹ 27 more rows
#Most represented: American Woman's Home
#Count how many lines per work
stowe_length <- stowe %>%
count(title, sort = TRUE)
# View longest and shortest
head(stowe_length, 1) # Longest is American Woman's Home
## # A tibble: 1 × 2
## title n
## <chr> <int>
## 1 "American Woman's Home: Or, Principles of Domestic Science;\r\nBeing a … 31704
tail(stowe_length, 1) # Shortest is Sunny Memories of Foreign Lands, V1
## # A tibble: 1 × 2
## title n
## <chr> <int>
## 1 Sunny Memories Of Foreign Lands, Volume 1 1
#Create a new variable that marks lines with commas
stowe <- stowe %>%
mutate(has_comma = str_detect(text, ","))
# View a sample of the new variable
head(stowe, 10)
## # A tibble: 10 × 4
## gutenberg_id text title has_comma
## <int> <chr> <chr> <lgl>
## 1 203 "Uncle Tom’s Cabin" Uncle Tom's Cabin FALSE
## 2 203 " or " Uncle Tom's Cabin FALSE
## 3 203 " Life among the Lowly" Uncle Tom's Cabin FALSE
## 4 203 "" Uncle Tom's Cabin FALSE
## 5 203 "by Harriet Beecher Stowe" Uncle Tom's Cabin FALSE
## 6 203 "" Uncle Tom's Cabin FALSE
## 7 203 "" Uncle Tom's Cabin FALSE
## 8 203 "Contents" Uncle Tom's Cabin FALSE
## 9 203 "" Uncle Tom's Cabin FALSE
## 10 203 " VOLUME I" Uncle Tom's Cabin FALSE
# Calculate the percentage of lines that contain commas
comma_percentage <- stowe %>%
summarise(percent_with_comma = mean(has_comma, na.rm = TRUE) * 100)
comma_percentage #About 51%
## # A tibble: 1 × 1
## percent_with_comma
## <dbl>
## 1 51.1
library(ggplot2)
library(lubridate)
stocks <- readRDS("C:/Users/emily/Downloads/stockpricedata.rds")
head(stocks)
## # A tibble: 6 × 8
## symbol date open high low close volume adjusted
## <chr> <date> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 AAPL 2020-01-02 74.1 75.2 73.8 75.1 135480400 72.9
## 2 AAPL 2020-01-03 74.3 75.1 74.1 74.4 146322800 72.2
## 3 AAPL 2020-01-06 73.4 75.0 73.2 74.9 118387200 72.7
## 4 AAPL 2020-01-07 75.0 75.2 74.4 74.6 108872000 72.4
## 5 AAPL 2020-01-08 74.3 76.1 74.3 75.8 132079200 73.6
## 6 AAPL 2020-01-09 76.8 77.6 76.6 77.4 170108400 75.1
#Plot Apple and Microsoft stock prices over time
ggplot(stocks, aes(x = date, y = close, color = symbol)) +
geom_line(size = 1) +
labs(title = "Apple (AAPL) and Microsoft (MSFT) Stock Prices, Jan 2020–Oct 2021",
subtitle = "Daily Closing Prices",
x = "Date",
y = "Closing Price (USD)",
color = "Company") +
theme_minimal(base_size = 12)
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
#Both stocks appear to show parallel trends. Both show a dip around March 2020 (around early COVID-19 market crash).
#After that, both companies recover and show steady growth through 2021.
#These trends align with increased investor confidence in tech companies during remote work expansion.
#Use floor_date() to round each date down to the start of its week
#Then, group by stock symbol and week to compute the average closing price
stocks_weekly <- stocks %>%
mutate(week = floor_date(date, "week")) %>% # Create 'week' variable
group_by(symbol, week) %>% # Group by company and week
summarize(avg_close = mean(close, na.rm = TRUE)) # Average closing price per week
## `summarise()` has grouped output by 'symbol'. You can override using the
## `.groups` argument.
#Plot weekly average prices
ggplot(stocks_weekly, aes(x = week, y = avg_close, color = symbol)) +
geom_line(size = 1) +
labs(title = "Weekly Average Stock Prices: Apple vs Microsoft",
subtitle = "January 2020 – October 2021",
x = "Week",
y = "Average Closing Price (USD)",
color = "Company") +
theme_minimal(base_size = 12)
#Aggregating to weekly averages smooths out the stock data; still same projection.
#We can see both companies steadily rising across 2020–2021.
#Apple shows slightly more volatility, especially around product release seasons.
##Combo of strings and dates
library(nycflights13)
## Warning: package 'nycflights13' was built under R version 4.4.3
flights <- flights
flights <- flights %>%
mutate(date = paste(month, day, year, sep = "/")) #Create "M/D/Y" string
#Confirm
head(flights$date)
## [1] "1/1/2013" "1/1/2013" "1/1/2013" "1/1/2013" "1/1/2013" "1/1/2013"
tail(flights$date) #Yup
## [1] "9/30/2013" "9/30/2013" "9/30/2013" "9/30/2013" "9/30/2013" "9/30/2013"
flights <- flights %>%
mutate(formatdate = as.Date(date, format = "%m/%d/%Y"))
#Confirm structure change
str(flights$formatdate)
## Date[1:336776], format: "2013-01-01" "2013-01-01" "2013-01-01" "2013-01-01" "2013-01-01" ...
summary(flights$formatdate)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## "2013-01-01" "2013-04-04" "2013-07-03" "2013-07-02" "2013-10-01" "2013-12-31"
daily_delays <- flights %>%
group_by(origin, formatdate) %>%
summarize(avg_dep_delay = mean(dep_delay, na.rm = TRUE)) %>%
ungroup()
## `summarise()` has grouped output by 'origin'. You can override using the
## `.groups` argument.
head(daily_delays)
## # A tibble: 6 × 3
## origin formatdate avg_dep_delay
## <chr> <date> <dbl>
## 1 EWR 2013-01-01 17.5
## 2 EWR 2013-01-02 25.3
## 3 EWR 2013-01-03 8.45
## 4 EWR 2013-01-04 12.1
## 5 EWR 2013-01-05 5.70
## 6 EWR 2013-01-06 12.4
ggplot(daily_delays, aes(x = formatdate, y = avg_dep_delay, color = origin)) +
geom_line() +
labs(title = "Average Daily Departure Delays by Airport",
x = "Date",
y = "Average Departure Delay",
color = "Airport")
#Attempting to make it less...messy
#Groups the dates into weeks
flights_weekly <- flights %>%
mutate(week = lubridate::floor_date(formatdate, "week")) #round each date down to the start of its week
#Calculate average weekly departure delay for each airport
weekly_delays <- flights_weekly %>%
group_by(origin, week) %>%
summarize(avg_dep_delay = mean(dep_delay, na.rm = TRUE)) %>%
ungroup()
## `summarise()` has grouped output by 'origin'. You can override using the
## `.groups` argument.
head(weekly_delays)
## # A tibble: 6 × 3
## origin week avg_dep_delay
## <chr> <date> <dbl>
## 1 EWR 2012-12-30 14.3
## 2 EWR 2013-01-06 6.03
## 3 EWR 2013-01-13 14.3
## 4 EWR 2013-01-20 20.0
## 5 EWR 2013-01-27 19.8
## 6 EWR 2013-02-03 10.8
#Plot weekly average departure delays
ggplot(weekly_delays, aes(x = week, y = avg_dep_delay, color = origin)) +
geom_line() +
labs(title = "Weekly Average Departure Delays by Airport",
x = "Week",
y = "Average Departure Delay",
color = "Airport")
#Somewhat better?
#Thank You