Linh Le

Tidy data for WHO

#step 1: load data:

glimpse(who)

## Rows: 7,240
## Columns: 60
## $ country      <chr> "Afghanistan", "Afghanistan", "Afghanistan", "Afghanistan…
## $ iso2         <chr> "AF", "AF", "AF", "AF", "AF", "AF", "AF", "AF", "AF", "AF…
## $ iso3         <chr> "AFG", "AFG", "AFG", "AFG", "AFG", "AFG", "AFG", "AFG", "…
## $ year         <dbl> 1980, 1981, 1982, 1983, 1984, 1985, 1986, 1987, 1988, 198…
## $ new_sp_m014  <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ new_sp_m1524 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ new_sp_m2534 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ new_sp_m3544 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ new_sp_m4554 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ new_sp_m5564 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ new_sp_m65   <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ new_sp_f014  <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ new_sp_f1524 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ new_sp_f2534 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ new_sp_f3544 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ new_sp_f4554 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ new_sp_f5564 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ new_sp_f65   <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ new_sn_m014  <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ new_sn_m1524 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ new_sn_m2534 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ new_sn_m3544 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ new_sn_m4554 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ new_sn_m5564 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ new_sn_m65   <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ new_sn_f014  <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ new_sn_f1524 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ new_sn_f2534 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ new_sn_f3544 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ new_sn_f4554 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ new_sn_f5564 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ new_sn_f65   <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ new_ep_m014  <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ new_ep_m1524 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ new_ep_m2534 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ new_ep_m3544 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ new_ep_m4554 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ new_ep_m5564 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ new_ep_m65   <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ new_ep_f014  <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ new_ep_f1524 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ new_ep_f2534 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ new_ep_f3544 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ new_ep_f4554 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ new_ep_f5564 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ new_ep_f65   <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ newrel_m014  <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ newrel_m1524 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ newrel_m2534 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ newrel_m3544 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ newrel_m4554 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ newrel_m5564 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ newrel_m65   <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ newrel_f014  <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ newrel_f1524 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ newrel_f2534 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ newrel_f3544 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ newrel_f4554 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ newrel_f5564 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ newrel_f65   <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…

#step 2: pivit longer:

who1 <- who %>%
  pivot_longer(
    cols = new_sp_m014:newrel_f65,
    names_to = "key",
    values_to = "cases",
    values_drop_na = TRUE
  )

who1

## # A tibble: 76,046 × 6
##    country     iso2  iso3   year key          cases
##    <chr>       <chr> <chr> <dbl> <chr>        <dbl>
##  1 Afghanistan AF    AFG    1997 new_sp_m014      0
##  2 Afghanistan AF    AFG    1997 new_sp_m1524    10
##  3 Afghanistan AF    AFG    1997 new_sp_m2534     6
##  4 Afghanistan AF    AFG    1997 new_sp_m3544     3
##  5 Afghanistan AF    AFG    1997 new_sp_m4554     5
##  6 Afghanistan AF    AFG    1997 new_sp_m5564     2
##  7 Afghanistan AF    AFG    1997 new_sp_m65       0
##  8 Afghanistan AF    AFG    1997 new_sp_f014      5
##  9 Afghanistan AF    AFG    1997 new_sp_f1524    38
## 10 Afghanistan AF    AFG    1997 new_sp_f2534    36
## # ℹ 76,036 more rows

#step 3: clean the key 

who2<- who1 %>%
  mutate(key = str_replace(key, "newrel", "new_rel"))

who2

## # A tibble: 76,046 × 6
##    country     iso2  iso3   year key          cases
##    <chr>       <chr> <chr> <dbl> <chr>        <dbl>
##  1 Afghanistan AF    AFG    1997 new_sp_m014      0
##  2 Afghanistan AF    AFG    1997 new_sp_m1524    10
##  3 Afghanistan AF    AFG    1997 new_sp_m2534     6
##  4 Afghanistan AF    AFG    1997 new_sp_m3544     3
##  5 Afghanistan AF    AFG    1997 new_sp_m4554     5
##  6 Afghanistan AF    AFG    1997 new_sp_m5564     2
##  7 Afghanistan AF    AFG    1997 new_sp_m65       0
##  8 Afghanistan AF    AFG    1997 new_sp_f014      5
##  9 Afghanistan AF    AFG    1997 new_sp_f1524    38
## 10 Afghanistan AF    AFG    1997 new_sp_f2534    36
## # ℹ 76,036 more rows

#step 4: seperate to different column

who3 <- who2 %>%
  separate(key, c("new","type", "sexage" ), sep="_")

who3

## # A tibble: 76,046 × 8
##    country     iso2  iso3   year new   type  sexage cases
##    <chr>       <chr> <chr> <dbl> <chr> <chr> <chr>  <dbl>
##  1 Afghanistan AF    AFG    1997 new   sp    m014       0
##  2 Afghanistan AF    AFG    1997 new   sp    m1524     10
##  3 Afghanistan AF    AFG    1997 new   sp    m2534      6
##  4 Afghanistan AF    AFG    1997 new   sp    m3544      3
##  5 Afghanistan AF    AFG    1997 new   sp    m4554      5
##  6 Afghanistan AF    AFG    1997 new   sp    m5564      2
##  7 Afghanistan AF    AFG    1997 new   sp    m65        0
##  8 Afghanistan AF    AFG    1997 new   sp    f014       5
##  9 Afghanistan AF    AFG    1997 new   sp    f1524     38
## 10 Afghanistan AF    AFG    1997 new   sp    f2534     36
## # ℹ 76,036 more rows

#step 5: remove iso2, iso3 and new
who4<- who3%>%
  dplyr ::select(-iso2,-iso3,-new)
who4

## # A tibble: 76,046 × 5
##    country      year type  sexage cases
##    <chr>       <dbl> <chr> <chr>  <dbl>
##  1 Afghanistan  1997 sp    m014       0
##  2 Afghanistan  1997 sp    m1524     10
##  3 Afghanistan  1997 sp    m2534      6
##  4 Afghanistan  1997 sp    m3544      3
##  5 Afghanistan  1997 sp    m4554      5
##  6 Afghanistan  1997 sp    m5564      2
##  7 Afghanistan  1997 sp    m65        0
##  8 Afghanistan  1997 sp    f014       5
##  9 Afghanistan  1997 sp    f1524     38
## 10 Afghanistan  1997 sp    f2534     36
## # ℹ 76,036 more rows

#step 6: seperate age and sex
who5 <- who4 %>%
  separate(sexage, c("sex", "age"), sep = 1)

who5

## # A tibble: 76,046 × 6
##    country      year type  sex   age   cases
##    <chr>       <dbl> <chr> <chr> <chr> <dbl>
##  1 Afghanistan  1997 sp    m     014       0
##  2 Afghanistan  1997 sp    m     1524     10
##  3 Afghanistan  1997 sp    m     2534      6
##  4 Afghanistan  1997 sp    m     3544      3
##  5 Afghanistan  1997 sp    m     4554      5
##  6 Afghanistan  1997 sp    m     5564      2
##  7 Afghanistan  1997 sp    m     65        0
##  8 Afghanistan  1997 sp    f     014       5
##  9 Afghanistan  1997 sp    f     1524     38
## 10 Afghanistan  1997 sp    f     2534     36
## # ℹ 76,036 more rows

TB cases over year

who6<- who5%>%
  filter(year>=2000)

ggplot(who6)+
  stat_summary(aes(x = year, y = cases, fill = age), fun = 'sum', geom = 'bar', position = 'dodge') + 
  labs(title = "TB cases in the world over after 2000", x = "Years", y = "Case Counts") +
  scale_fill_discrete(labels = c("0-14 yrs", "15-24 yrs", "25-34 yrs", "35-44 yrs", "45-54 yrs", "55-64 yrs", "> 65 yrs"))

Tidy and visualize the US_average_tuition data set

us_avg_tuition <- read.csv("/Users/linhle/Downloads/School material/D-Visual/us_avg_tuition.csv")
glimpse(us_avg_tuition)

## Rows: 50
## Columns: 13
## $ State    <chr> "Alabama", "Alaska", "Arizona", "Arkansas", "California", "Co…
## $ X2004.05 <chr> "$5,683", "$4,328", "$5,138", "$5,772", "$5,286", "$4,704", "…
## $ X2005.06 <chr> "$5,841", "$4,633", "$5,416", "$6,082", "$5,528", "$5,407", "…
## $ X2006.07 <chr> "$5,753", "$4,919", "$5,481", "$6,232", "$5,335", "$5,596", "…
## $ X2007.08 <chr> "$6,008", "$5,070", "$5,682", "$6,415", "$5,672", "$6,227", "…
## $ X2008.09 <chr> "$6,475", "$5,075", "$6,058", "$6,417", "$5,898", "$6,284", "…
## $ X2009.10 <chr> "$7,189", "$5,455", "$7,263", "$6,627", "$7,259", "$6,948", "…
## $ X2010.11 <chr> "$8,071", "$5,759", "$8,840", "$6,901", "$8,194", "$7,748", "…
## $ X2011.12 <chr> "$8,452", "$5,762", "$9,967", "$7,029", "$9,436", "$8,316", "…
## $ X2012.13 <chr> "$9,098", "$6,026", "$10,134", "$7,287", "$9,361", "$8,793", …
## $ X2013.14 <chr> "$9,359", "$6,012", "$10,296", "$7,408", "$9,274", "$9,293", …
## $ X2014.15 <chr> "$9,496", "$6,149", "$10,414", "$7,606", "$9,187", "$9,299", …
## $ X2015.16 <chr> "$9,751", "$6,571", "$10,646", "$7,867", "$9,270", "$9,748", …

Uat1<-us_avg_tuition %>%
  pivot_longer(
    cols = X2004.05:X2015.16,
    names_to = "years",
    values_to = "tuition")

Uat1

## # A tibble: 600 × 3
##    State   years    tuition
##    <chr>   <chr>    <chr>  
##  1 Alabama X2004.05 $5,683 
##  2 Alabama X2005.06 $5,841 
##  3 Alabama X2006.07 $5,753 
##  4 Alabama X2007.08 $6,008 
##  5 Alabama X2008.09 $6,475 
##  6 Alabama X2009.10 $7,189 
##  7 Alabama X2010.11 $8,071 
##  8 Alabama X2011.12 $8,452 
##  9 Alabama X2012.13 $9,098 
## 10 Alabama X2013.14 $9,359 
## # ℹ 590 more rows

Uat2 <- Uat1 %>%
  separate(years, c("X", "years"), sep = 1)%>%
  dplyr ::select(-X)

Uat2

## # A tibble: 600 × 3
##    State   years   tuition
##    <chr>   <chr>   <chr>  
##  1 Alabama 2004.05 $5,683 
##  2 Alabama 2005.06 $5,841 
##  3 Alabama 2006.07 $5,753 
##  4 Alabama 2007.08 $6,008 
##  5 Alabama 2008.09 $6,475 
##  6 Alabama 2009.10 $7,189 
##  7 Alabama 2010.11 $8,071 
##  8 Alabama 2011.12 $8,452 
##  9 Alabama 2012.13 $9,098 
## 10 Alabama 2013.14 $9,359 
## # ℹ 590 more rows

Make an informative visualization of the data to show the average tuition across all years in the data set for each state. Which state has the highest tuition? Which state has the lowest?

Uat3<- Uat2 %>%
  mutate(tuition_clean = as.numeric(str_remove_all(tuition, "[$,]")))
Uat3

## # A tibble: 600 × 4
##    State   years   tuition tuition_clean
##    <chr>   <chr>   <chr>           <dbl>
##  1 Alabama 2004.05 $5,683           5683
##  2 Alabama 2005.06 $5,841           5841
##  3 Alabama 2006.07 $5,753           5753
##  4 Alabama 2007.08 $6,008           6008
##  5 Alabama 2008.09 $6,475           6475
##  6 Alabama 2009.10 $7,189           7189
##  7 Alabama 2010.11 $8,071           8071
##  8 Alabama 2011.12 $8,452           8452
##  9 Alabama 2012.13 $9,098           9098
## 10 Alabama 2013.14 $9,359           9359
## # ℹ 590 more rows

Uat4 <-Uat3 %>%
group_by(State)%>%
  summarise(Average_tuition_of_all_years = mean(tuition_clean, na.rm = TRUE))%>%
  arrange(Average_tuition_of_all_years)
  
Uat4

## # A tibble: 50 × 2
##    State          Average_tuition_of_all_years
##    <chr>                                 <dbl>
##  1 Wyoming                               4307.
##  2 Florida                               5131.
##  3 Utah                                  5228.
##  4 Nevada                                5367 
##  5 Louisiana                             5478.
##  6 Alaska                                5480.
##  7 New Mexico                            5485.
##  8 North Carolina                        5589.
##  9 Idaho                                 5633.
## 10 West Virginia                         5652.
## # ℹ 40 more rows

ggplot(Uat4) +
  geom_bar(aes(x = reorder(State, Average_tuition_of_all_years), 
               y = Average_tuition_of_all_years), 
           stat = "identity") +
  coord_flip() +
  labs(x = "State", y = "Average Tuition of All Years")

Answer : We can tell Vermort has the highest average tuition over years and Wyoming has the lowest one

Make an informative visualization of the data to show the increasing rate of average tuition from 2004-2005 to 2015-2016 in each state. Which state’s tuition increased at the fastest rate? Which one the slowest?

Uat5 <-Uat3 %>%
  filter(years == "2004.05" | years== "2015.16") %>%
  dplyr ::select(State, years, tuition_clean ) %>%
  group_by(State) %>%
  summarize(ir = (tuition_clean[years == "2015.16"] / tuition_clean[years == "2004.05"])^(1/11) - 1)
 


Uat5

## # A tibble: 50 × 2
##    State           ir
##    <chr>        <dbl>
##  1 Alabama     0.0503
##  2 Alaska      0.0387
##  3 Arizona     0.0685
##  4 Arkansas    0.0286
##  5 California  0.0524
##  6 Colorado    0.0685
##  7 Connecticut 0.0329
##  8 Delaware    0.0309
##  9 Florida     0.0467
## 10 Georgia     0.0633
## # ℹ 40 more rows

ggplot(Uat5) +
  geom_bar(aes(x = reorder(State, ir), 
               y = ir), 
           stat = "identity") +
  coord_flip() +
  labs(x = "State", title = "Increasing rate of average tuition from 2004-2005 to 2015-2016 in each state", y = "Increasing rate")

Answer : Hawaii’s tuition increased at the fastest rate and Ohio is the slowest

Create a airport map with each airport location marked on the map and colored by the number of flights per day from NYC to each airport

because I don’t know which day you meant so I made to graph, one is average flights per day and the other one is just flights per day but across 12 days.

fpd<- flights%>%
  group_by(year, month, day, dest)%>%
  summarise(flight_per_day = n(), .groups = "drop")%>%
  group_by(dest) %>%
  summarise(avg_flights_per_day = mean(flight_per_day))

fpd

## # A tibble: 105 × 2
##    dest  avg_flights_per_day
##    <chr>               <dbl>
##  1 ABQ                  1   
##  2 ACK                  1.71
##  3 ALB                  1.69
##  4 ANC                  1   
##  5 ATL                 47.2 
##  6 AUS                  6.68
##  7 AVL                  1.11
##  8 BDL                  1.76
##  9 BGR                  1.44
## 10 BHM                  1   
## # ℹ 95 more rows

airports %>%
  semi_join(fpd, c("faa" = "dest")) %>%
  left_join(fpd, by = c("faa" = "dest")) %>%
  ggplot(aes(lon, lat)) +
    borders("state") +
    geom_point(aes(color = avg_flights_per_day) ) +
    coord_quickmap()+
  labs(
      title = "Average Number of Flights Per Day from NYC",
      color = "Avg Flights/Day",
      x = "Longitude",
      y = "Latitude"
    )

## Warning: `borders()` was deprecated in ggplot2 4.0.0.
## ℹ Please use `annotation_borders()` instead.
## This warning is displayed once per session.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

fpd_12<- flights%>%
  group_by(year, month, day, dest)%>%
  summarise(flight_per_day = n(), .groups = "drop")

fpd_12

## # A tibble: 31,229 × 5
##     year month   day dest  flight_per_day
##    <int> <int> <int> <chr>          <int>
##  1  2013     1     1 ALB                3
##  2  2013     1     1 ATL               40
##  3  2013     1     1 AUS                6
##  4  2013     1     1 AVL                1
##  5  2013     1     1 BDL                2
##  6  2013     1     1 BNA               10
##  7  2013     1     1 BOS               25
##  8  2013     1     1 BQN                3
##  9  2013     1     1 BTV                7
## 10  2013     1     1 BUF               16
## # ℹ 31,219 more rows

airports %>%
  semi_join(fpd_12, c("faa" = "dest")) %>%
  left_join(fpd_12, by = c("faa" = "dest")) %>%
  ggplot(aes(lon, lat)) +
    borders("state") +
    geom_point(aes(color =flight_per_day ) ) +
    coord_quickmap()+
  labs(
      title = "Flights Per Day accross 12 days from NYC",
      color = " Flights/Day",
      x = "Longitude",
      y = "Latitude"
    )

What weather conditions make it more likely to see a departure delay? hot or cold weather? windy weather? rainy or snowy? foggy? Create a proper data frame and use proper visualization or summary table to answer the question.

glimpse(weather)

## Rows: 26,115
## Columns: 15
## $ origin     <chr> "EWR", "EWR", "EWR", "EWR", "EWR", "EWR", "EWR", "EWR", "EW…
## $ year       <int> 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013,…
## $ month      <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ day        <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ hour       <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 14, 15, 16, 17, 18, …
## $ temp       <dbl> 39.02, 39.02, 39.02, 39.92, 39.02, 37.94, 39.02, 39.92, 39.…
## $ dewp       <dbl> 26.06, 26.96, 28.04, 28.04, 28.04, 28.04, 28.04, 28.04, 28.…
## $ humid      <dbl> 59.37, 61.63, 64.43, 62.21, 64.43, 67.21, 64.43, 62.21, 62.…
## $ wind_dir   <dbl> 270, 250, 240, 250, 260, 240, 240, 250, 260, 260, 260, 330,…
## $ wind_speed <dbl> 10.35702, 8.05546, 11.50780, 12.65858, 12.65858, 11.50780, …
## $ wind_gust  <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 20.…
## $ precip     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ pressure   <dbl> 1012.0, 1012.3, 1012.5, 1012.2, 1011.9, 1012.4, 1012.2, 101…
## $ visib      <dbl> 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,…
## $ time_hour  <dttm> 2013-01-01 01:00:00, 2013-01-01 02:00:00, 2013-01-01 03:00…

fw<- flights %>% 
  left_join(weather)

## Joining with `by = join_by(year, month, day, origin, hour, time_hour)`

fw1 <- fw%>%
  group_by(dep_delay, temp, wind_speed, precip, visib )%>%
  dplyr ::select(dep_delay, temp, wind_speed, precip, visib)%>%
  arrange(desc(dep_delay))
  
fw1_c <- fw1%>%
  filter(!is.na(dep_delay),!is.na(temp),!is.na(wind_speed),!is.na(precip),!is.na(visib))

tribble(
  ~Delay_by, ~cor,
  #--|--|----
  "Temp", cor(fw1_c$dep_delay, fw1_c$temp),
  "Wind speed", cor(fw1_c$dep_delay, fw1_c$wind_speed),
  "Precipitation", cor(fw1_c$dep_delay, fw1_c$precip),
  "Visibility",cor(fw1_c$dep_delay, fw1_c$visib)
)

## # A tibble: 4 × 2
##   Delay_by          cor
##   <chr>           <dbl>
## 1 Temp           0.0615
## 2 Wind speed     0.0474
## 3 Precipitation  0.0904
## 4 Visibility    -0.0942

Based on the correlation, the precipitation and the visibility might be more likely to see a departure delay but It seems like the correlation is quite weak here so actually none of the weather variables meaningfully predict flight delays based on this method.

Display the spatial pattern of arrival delays on June 13, 2013 using a map, and then use Google to cross-reference with the weather. Explain how the weather condition might have affected the spatial pattern of arrival delays.

flights %>%
  filter(!is.na(arr_delay), year == 2013, month == 6, day == 13) %>%
  group_by(dest) %>%
  summarise(avg_delay = mean(arr_delay)) -> delay_data

airports %>%
  semi_join(delay_data, by = c("faa" = "dest")) %>%
  left_join(delay_data, by = c("faa" = "dest")) %>%
  ggplot(aes(lon, lat)) +
    borders("state") +
    geom_point(aes(color = avg_delay)) +
    scale_color_gradient(   low = "green",   high = "red") +
    coord_quickmap()

Answer: It seems like the there is the high delay in the Northeast/Mid-Atlantic region (NYC, Boston, Philadelphia, DC area). and by google, A derecho formed in the Midwest on June 12, tracking eastward into Ohio, Pennsylvania, and the Mid-Atlantic states. It produced damaging winds, hail up to 2.75 inches (7.0 cm) in diameter, and five tornadoes in Iowa.

=> This is the reason why they need to delay the flight there

Linh Le - DV HW 3

2026-03-22