library(tidyverse)
## -- Attaching core tidyverse packages ------------------------ tidyverse 2.0.0 --
## v dplyr 1.1.4 v readr 2.1.6
## v forcats 1.0.1 v stringr 1.6.0
## v ggplot2 4.0.1 v tibble 3.3.1
## v lubridate 1.9.4 v tidyr 1.3.2
## v purrr 1.2.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
## i Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(tidyr)
library(dplyr)
library(ggplot2)
library(openintro)
## 载入需要的程序包:airports
## 载入需要的程序包:cherryblossom
## 载入需要的程序包:usdata
library(nycflights13)
library(maps)
##
## 载入程序包:'maps'
##
## The following object is masked from 'package:purrr':
##
## map
glimpse(who)
## Rows: 7,240
## Columns: 60
## $ country <chr> "Afghanistan", "Afghanistan", "Afghanistan", "Afghanistan~
## $ iso2 <chr> "AF", "AF", "AF", "AF", "AF", "AF", "AF", "AF", "AF", "AF~
## $ iso3 <chr> "AFG", "AFG", "AFG", "AFG", "AFG", "AFG", "AFG", "AFG", "~
## $ year <dbl> 1980, 1981, 1982, 1983, 1984, 1985, 1986, 1987, 1988, 198~
## $ new_sp_m014 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N~
## $ new_sp_m1524 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N~
## $ new_sp_m2534 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N~
## $ new_sp_m3544 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N~
## $ new_sp_m4554 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N~
## $ new_sp_m5564 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N~
## $ new_sp_m65 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N~
## $ new_sp_f014 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N~
## $ new_sp_f1524 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N~
## $ new_sp_f2534 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N~
## $ new_sp_f3544 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N~
## $ new_sp_f4554 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N~
## $ new_sp_f5564 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N~
## $ new_sp_f65 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N~
## $ new_sn_m014 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N~
## $ new_sn_m1524 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N~
## $ new_sn_m2534 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N~
## $ new_sn_m3544 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N~
## $ new_sn_m4554 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N~
## $ new_sn_m5564 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N~
## $ new_sn_m65 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N~
## $ new_sn_f014 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N~
## $ new_sn_f1524 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N~
## $ new_sn_f2534 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N~
## $ new_sn_f3544 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N~
## $ new_sn_f4554 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N~
## $ new_sn_f5564 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N~
## $ new_sn_f65 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N~
## $ new_ep_m014 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N~
## $ new_ep_m1524 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N~
## $ new_ep_m2534 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N~
## $ new_ep_m3544 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N~
## $ new_ep_m4554 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N~
## $ new_ep_m5564 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N~
## $ new_ep_m65 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N~
## $ new_ep_f014 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N~
## $ new_ep_f1524 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N~
## $ new_ep_f2534 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N~
## $ new_ep_f3544 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N~
## $ new_ep_f4554 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N~
## $ new_ep_f5564 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N~
## $ new_ep_f65 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N~
## $ newrel_m014 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N~
## $ newrel_m1524 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N~
## $ newrel_m2534 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N~
## $ newrel_m3544 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N~
## $ newrel_m4554 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N~
## $ newrel_m5564 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N~
## $ newrel_m65 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N~
## $ newrel_f014 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N~
## $ newrel_f1524 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N~
## $ newrel_f2534 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N~
## $ newrel_f3544 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N~
## $ newrel_f4554 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N~
## $ newrel_f5564 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N~
## $ newrel_f65 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N~
who1 <- who %>%
pivot_longer(
cols = new_sp_m014:newrel_f65,
names_to = "key",
values_to = "cases",
values_drop_na = TRUE
)%>%
mutate(key=str_replace(key,"newrel","new_rel"))
who1
## # A tibble: 76,046 x 6
## country iso2 iso3 year key cases
## <chr> <chr> <chr> <dbl> <chr> <dbl>
## 1 Afghanistan AF AFG 1997 new_sp_m014 0
## 2 Afghanistan AF AFG 1997 new_sp_m1524 10
## 3 Afghanistan AF AFG 1997 new_sp_m2534 6
## 4 Afghanistan AF AFG 1997 new_sp_m3544 3
## 5 Afghanistan AF AFG 1997 new_sp_m4554 5
## 6 Afghanistan AF AFG 1997 new_sp_m5564 2
## 7 Afghanistan AF AFG 1997 new_sp_m65 0
## 8 Afghanistan AF AFG 1997 new_sp_f014 5
## 9 Afghanistan AF AFG 1997 new_sp_f1524 38
## 10 Afghanistan AF AFG 1997 new_sp_f2534 36
## # i 76,036 more rows
who2 <- who1 %>%
select(-iso2,-iso3)
who2
## # A tibble: 76,046 x 4
## country year key cases
## <chr> <dbl> <chr> <dbl>
## 1 Afghanistan 1997 new_sp_m014 0
## 2 Afghanistan 1997 new_sp_m1524 10
## 3 Afghanistan 1997 new_sp_m2534 6
## 4 Afghanistan 1997 new_sp_m3544 3
## 5 Afghanistan 1997 new_sp_m4554 5
## 6 Afghanistan 1997 new_sp_m5564 2
## 7 Afghanistan 1997 new_sp_m65 0
## 8 Afghanistan 1997 new_sp_f014 5
## 9 Afghanistan 1997 new_sp_f1524 38
## 10 Afghanistan 1997 new_sp_f2534 36
## # i 76,036 more rows
who3<-who2%>%
separate(key,c('new','type','sexage'))
who3
## # A tibble: 76,046 x 6
## country year new type sexage cases
## <chr> <dbl> <chr> <chr> <chr> <dbl>
## 1 Afghanistan 1997 new sp m014 0
## 2 Afghanistan 1997 new sp m1524 10
## 3 Afghanistan 1997 new sp m2534 6
## 4 Afghanistan 1997 new sp m3544 3
## 5 Afghanistan 1997 new sp m4554 5
## 6 Afghanistan 1997 new sp m5564 2
## 7 Afghanistan 1997 new sp m65 0
## 8 Afghanistan 1997 new sp f014 5
## 9 Afghanistan 1997 new sp f1524 38
## 10 Afghanistan 1997 new sp f2534 36
## # i 76,036 more rows
who4<-who3%>%
separate(sexage,c("sex","age"),sep=1)
who4
## # A tibble: 76,046 x 7
## country year new type sex age cases
## <chr> <dbl> <chr> <chr> <chr> <chr> <dbl>
## 1 Afghanistan 1997 new sp m 014 0
## 2 Afghanistan 1997 new sp m 1524 10
## 3 Afghanistan 1997 new sp m 2534 6
## 4 Afghanistan 1997 new sp m 3544 3
## 5 Afghanistan 1997 new sp m 4554 5
## 6 Afghanistan 1997 new sp m 5564 2
## 7 Afghanistan 1997 new sp m 65 0
## 8 Afghanistan 1997 new sp f 014 5
## 9 Afghanistan 1997 new sp f 1524 38
## 10 Afghanistan 1997 new sp f 2534 36
## # i 76,036 more rows
who5<-who4%>%
select(-new)
who5
## # A tibble: 76,046 x 6
## country year type sex age cases
## <chr> <dbl> <chr> <chr> <chr> <dbl>
## 1 Afghanistan 1997 sp m 014 0
## 2 Afghanistan 1997 sp m 1524 10
## 3 Afghanistan 1997 sp m 2534 6
## 4 Afghanistan 1997 sp m 3544 3
## 5 Afghanistan 1997 sp m 4554 5
## 6 Afghanistan 1997 sp m 5564 2
## 7 Afghanistan 1997 sp m 65 0
## 8 Afghanistan 1997 sp f 014 5
## 9 Afghanistan 1997 sp f 1524 38
## 10 Afghanistan 1997 sp f 2534 36
## # i 76,036 more rows
tb_sex_age <- who5 %>%
group_by(age, sex) %>%
summarise(total_cases = sum(cases, na.rm = TRUE))
## `summarise()` has grouped output by 'age'. You can override using the `.groups`
## argument.
tb_sex_age
## # A tibble: 14 x 3
## # Groups: age [7]
## age sex total_cases
## <chr> <chr> <dbl>
## 1 014 f 884247
## 2 014 m 824004
## 3 1524 f 3421560
## 4 1524 m 4221167
## 5 2534 f 3870651
## 6 2534 m 5765476
## 7 3544 f 3018113
## 8 3544 m 5748603
## 9 4554 f 1960221
## 10 4554 m 4540505
## 11 5564 f 1409548
## 12 5564 m 3390120
## 13 65 f 1342684
## 14 65 m 3000619
ggplot(tb_sex_age, aes(x = age, y = total_cases, fill = sex)) +
geom_col(position = "dodge")
total, the number of female cases is higher than that of male cases The
number of cases varies across age groups Cases are lower in the youngest
age group and higher in the middle and older age groups
avg_tuition <- read.csv("D:/lilith/us_avg_tuition.csv")
glimpse(avg_tuition)
## Rows: 50
## Columns: 13
## $ 锘縎tate <chr> "Alabama", "Alaska", "Arizona", "Arkansas", "California", "Co~
## $ X2004.05 <chr> "$5,683", "$4,328", "$5,138", "$5,772", "$5,286", "$4,704", "~
## $ X2005.06 <chr> "$5,841", "$4,633", "$5,416", "$6,082", "$5,528", "$5,407", "~
## $ X2006.07 <chr> "$5,753", "$4,919", "$5,481", "$6,232", "$5,335", "$5,596", "~
## $ X2007.08 <chr> "$6,008", "$5,070", "$5,682", "$6,415", "$5,672", "$6,227", "~
## $ X2008.09 <chr> "$6,475", "$5,075", "$6,058", "$6,417", "$5,898", "$6,284", "~
## $ X2009.10 <chr> "$7,189", "$5,455", "$7,263", "$6,627", "$7,259", "$6,948", "~
## $ X2010.11 <chr> "$8,071", "$5,759", "$8,840", "$6,901", "$8,194", "$7,748", "~
## $ X2011.12 <chr> "$8,452", "$5,762", "$9,967", "$7,029", "$9,436", "$8,316", "~
## $ X2012.13 <chr> "$9,098", "$6,026", "$10,134", "$7,287", "$9,361", "$8,793", ~
## $ X2013.14 <chr> "$9,359", "$6,012", "$10,296", "$7,408", "$9,274", "$9,293", ~
## $ X2014.15 <chr> "$9,496", "$6,149", "$10,414", "$7,606", "$9,187", "$9,299", ~
## $ X2015.16 <chr> "$9,751", "$6,571", "$10,646", "$7,867", "$9,270", "$9,748", ~
at1<-avg_tuition%>%
pivot_longer(
cols = "X2004.05":"X2015.16",
names_to = "year",
values_to = "avg_money",
values_drop_na = TRUE
)
at1
## # A tibble: 600 x 3
## 锘縎tate year avg_money
## <chr> <chr> <chr>
## 1 Alabama X2004.05 $5,683
## 2 Alabama X2005.06 $5,841
## 3 Alabama X2006.07 $5,753
## 4 Alabama X2007.08 $6,008
## 5 Alabama X2008.09 $6,475
## 6 Alabama X2009.10 $7,189
## 7 Alabama X2010.11 $8,071
## 8 Alabama X2011.12 $8,452
## 9 Alabama X2012.13 $9,098
## 10 Alabama X2013.14 $9,359
## # i 590 more rows
at2<-at1%>%
separate(year,c("x","year"),sep=1)%>%
separate(avg_money,c("$","avg_money"),sep=1)%>%
select(-x,-"$")
at2
## # A tibble: 600 x 3
## 锘縎tate year avg_money
## <chr> <chr> <chr>
## 1 Alabama 2004.05 5,683
## 2 Alabama 2005.06 5,841
## 3 Alabama 2006.07 5,753
## 4 Alabama 2007.08 6,008
## 5 Alabama 2008.09 6,475
## 6 Alabama 2009.10 7,189
## 7 Alabama 2010.11 8,071
## 8 Alabama 2011.12 8,452
## 9 Alabama 2012.13 9,098
## 10 Alabama 2013.14 9,359
## # i 590 more rows
names(at2) <- c("state", "year", "avg_money")
at2
## # A tibble: 600 x 3
## state year avg_money
## <chr> <chr> <chr>
## 1 Alabama 2004.05 5,683
## 2 Alabama 2005.06 5,841
## 3 Alabama 2006.07 5,753
## 4 Alabama 2007.08 6,008
## 5 Alabama 2008.09 6,475
## 6 Alabama 2009.10 7,189
## 7 Alabama 2010.11 8,071
## 8 Alabama 2011.12 8,452
## 9 Alabama 2012.13 9,098
## 10 Alabama 2013.14 9,359
## # i 590 more rows
at3<-at2%>%
separate(avg_money,c("thousand","dollar"),sep=",")%>%
unite(avg_money,thousand,dollar,sep="")%>%
mutate(avg_money = as.integer(avg_money))
at3
## # A tibble: 600 x 3
## state year avg_money
## <chr> <chr> <int>
## 1 Alabama 2004.05 5683
## 2 Alabama 2005.06 5841
## 3 Alabama 2006.07 5753
## 4 Alabama 2007.08 6008
## 5 Alabama 2008.09 6475
## 6 Alabama 2009.10 7189
## 7 Alabama 2010.11 8071
## 8 Alabama 2011.12 8452
## 9 Alabama 2012.13 9098
## 10 Alabama 2013.14 9359
## # i 590 more rows
at3%>%
group_by(state)%>%
summarise(total_avg_money= mean(avg_money, na.rm = TRUE))%>%
arrange(desc(total_avg_money))
## # A tibble: 50 x 2
## state total_avg_money
## <chr> <dbl>
## 1 Vermont 13067.
## 2 New Hampshire 12781.
## 3 New Jersey 12054.
## 4 Pennsylvania 11970
## 5 Illinois 11228.
## 6 Michigan 10477
## 7 South Carolina 10377.
## 8 Delaware 10099.
## 9 Massachusetts 10058.
## 10 Ohio 9942.
## # i 40 more rows
Vermont highest, Wyoming lowest
tuition_rate <- at3 %>%
filter(year %in% c("2004.05", "2015.16")) %>%
pivot_wider(names_from = year, values_from = avg_money) %>%
mutate(increase_rate = (`2015.16` - `2004.05`) / `2004.05`) %>%
arrange(desc(increase_rate))
tuition_rate
## # A tibble: 50 x 4
## state `2004.05` `2015.16` increase_rate
## <chr> <int> <int> <dbl>
## 1 Hawaii 4267 10175 1.38
## 2 Colorado 4704 9748 1.07
## 3 Arizona 5138 10646 1.07
## 4 Georgia 4298 8447 0.965
## 5 Nevada 3621 6667 0.841
## 6 Louisiana 4453 7871 0.768
## 7 California 5286 9270 0.754
## 8 Alabama 5683 9751 0.716
## 9 Tennessee 5426 9263 0.707
## 10 Kentucky 5640 9567 0.696
## # i 40 more rows
hawaii fastest, ohio lowest
lab_hw11
flights_data <- flights %>%
mutate(date = as.Date(time_hour)) %>%
filter(origin %in% c("EWR", "JFK", "LGA")) %>%
group_by(dest) %>%
summarise(flights_per_day = n() / n_distinct(date))
airports %>%
semi_join(flights_data, by = c("faa" = "dest")) %>%
left_join(flights_data, by = c("faa" = "dest")) %>%
ggplot(aes(x = lon, y = lat)) +
borders("state") +
geom_point(aes(color = flights_per_day), size = 2) +
scale_color_gradient(low = "red", high = "black") +
coord_quickmap() +
labs(
title = "Average Daily Flights from NYC to U.S. Airports",
x = "Longitude",
y = "Latitude",
color = "Flights per Day"
)
## Warning: `borders()` was deprecated in ggplot2 4.0.0.
## i Please use `annotation_borders()` instead.
## This warning is displayed once per session.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
glimpse(weather)
## Rows: 26,115
## Columns: 15
## $ origin <chr> "EWR", "EWR", "EWR", "EWR", "EWR", "EWR", "EWR", "EWR", "EW~
## $ year <int> 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013,~
## $ month <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,~
## $ day <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,~
## $ hour <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 14, 15, 16, 17, 18, ~
## $ temp <dbl> 39.02, 39.02, 39.02, 39.92, 39.02, 37.94, 39.02, 39.92, 39.~
## $ dewp <dbl> 26.06, 26.96, 28.04, 28.04, 28.04, 28.04, 28.04, 28.04, 28.~
## $ humid <dbl> 59.37, 61.63, 64.43, 62.21, 64.43, 67.21, 64.43, 62.21, 62.~
## $ wind_dir <dbl> 270, 250, 240, 250, 260, 240, 240, 250, 260, 260, 260, 330,~
## $ wind_speed <dbl> 10.35702, 8.05546, 11.50780, 12.65858, 12.65858, 11.50780, ~
## $ wind_gust <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 20.~
## $ precip <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,~
## $ pressure <dbl> 1012.0, 1012.3, 1012.5, 1012.2, 1011.9, 1012.4, 1012.2, 101~
## $ visib <dbl> 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,~
## $ time_hour <dttm> 2013-01-01 01:00:00, 2013-01-01 02:00:00, 2013-01-01 03:00~
?weather
## 打开httpd帮助服务器… 好了
glimpse(flights)
## Rows: 336,776
## Columns: 19
## $ year <int> 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2~
## $ month <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1~
## $ day <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1~
## $ dep_time <int> 517, 533, 542, 544, 554, 554, 555, 557, 557, 558, 558, ~
## $ sched_dep_time <int> 515, 529, 540, 545, 600, 558, 600, 600, 600, 600, 600, ~
## $ dep_delay <dbl> 2, 4, 2, -1, -6, -4, -5, -3, -3, -2, -2, -2, -2, -2, -1~
## $ arr_time <int> 830, 850, 923, 1004, 812, 740, 913, 709, 838, 753, 849,~
## $ sched_arr_time <int> 819, 830, 850, 1022, 837, 728, 854, 723, 846, 745, 851,~
## $ arr_delay <dbl> 11, 20, 33, -18, -25, 12, 19, -14, -8, 8, -2, -3, 7, -1~
## $ carrier <chr> "UA", "UA", "AA", "B6", "DL", "UA", "B6", "EV", "B6", "~
## $ flight <int> 1545, 1714, 1141, 725, 461, 1696, 507, 5708, 79, 301, 4~
## $ tailnum <chr> "N14228", "N24211", "N619AA", "N804JB", "N668DN", "N394~
## $ origin <chr> "EWR", "LGA", "JFK", "JFK", "LGA", "EWR", "EWR", "LGA",~
## $ dest <chr> "IAH", "IAH", "MIA", "BQN", "ATL", "ORD", "FLL", "IAD",~
## $ air_time <dbl> 227, 227, 160, 183, 116, 150, 158, 53, 140, 138, 149, 1~
## $ distance <dbl> 1400, 1416, 1089, 1576, 762, 719, 1065, 229, 944, 733, ~
## $ hour <dbl> 5, 5, 5, 5, 6, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 6, 6, 6~
## $ minute <dbl> 15, 29, 40, 45, 0, 58, 0, 0, 0, 0, 0, 0, 0, 0, 0, 59, 0~
## $ time_hour <dttm> 2013-01-01 05:00:00, 2013-01-01 05:00:00, 2013-01-01 0~
hot or cold weather:temp windy weather:wind_speed rainy or snowy:precip/pressure foggy:visib
flights_weather <- flights %>%
left_join(weather, by = c("origin", "time_hour"))%>%
select(tailnum,temp,wind_speed,precip,visib,dep_delay)
flights_weather
## # A tibble: 336,776 x 6
## tailnum temp wind_speed precip visib dep_delay
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 N14228 39.0 12.7 0 10 2
## 2 N24211 39.9 15.0 0 10 4
## 3 N619AA 39.0 15.0 0 10 2
## 4 N804JB 39.0 15.0 0 10 -1
## 5 N668DN 39.9 16.1 0 10 -6
## 6 N39463 39.0 12.7 0 10 -4
## 7 N516JB 37.9 11.5 0 10 -5
## 8 N829AS 39.9 16.1 0 10 -3
## 9 N593JB 37.9 13.8 0 10 -3
## 10 N3ALAA 39.9 16.1 0 10 -2
## # i 336,766 more rows
fw1<-flights_weather%>%
pivot_longer(
cols = temp:visib,
names_to = "weather_type",
values_to = "weather_value",
values_drop_na = TRUE
)
fw1
## # A tibble: 1,340,785 x 4
## tailnum dep_delay weather_type weather_value
## <chr> <dbl> <chr> <dbl>
## 1 N14228 2 temp 39.0
## 2 N14228 2 wind_speed 12.7
## 3 N14228 2 precip 0
## 4 N14228 2 visib 10
## 5 N24211 4 temp 39.9
## 6 N24211 4 wind_speed 15.0
## 7 N24211 4 precip 0
## 8 N24211 4 visib 10
## 9 N619AA 2 temp 39.0
## 10 N619AA 2 wind_speed 15.0
## # i 1,340,775 more rows
fw2 <- flights_weather %>%
mutate(
temp_cate = ifelse(temp > 80, "hot",
ifelse(temp < 32, "cold", "normal_temp")),
wind_cate = ifelse(wind_speed > 20, "high_wind", "normal_wind"),
precip_cate = ifelse(precip > 0, "rainy/snowy", "no_precip"),
visib_cate = ifelse(visib < 2, "foggy", "clear")
)
fw2
## # A tibble: 336,776 x 10
## tailnum temp wind_speed precip visib dep_delay temp_cate wind_cate
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <chr> <chr>
## 1 N14228 39.0 12.7 0 10 2 normal_temp normal_wind
## 2 N24211 39.9 15.0 0 10 4 normal_temp normal_wind
## 3 N619AA 39.0 15.0 0 10 2 normal_temp normal_wind
## 4 N804JB 39.0 15.0 0 10 -1 normal_temp normal_wind
## 5 N668DN 39.9 16.1 0 10 -6 normal_temp normal_wind
## 6 N39463 39.0 12.7 0 10 -4 normal_temp normal_wind
## 7 N516JB 37.9 11.5 0 10 -5 normal_temp normal_wind
## 8 N829AS 39.9 16.1 0 10 -3 normal_temp normal_wind
## 9 N593JB 37.9 13.8 0 10 -3 normal_temp normal_wind
## 10 N3ALAA 39.9 16.1 0 10 -2 normal_temp normal_wind
## # i 336,766 more rows
## # i 2 more variables: precip_cate <chr>, visib_cate <chr>
summary_delay <- fw2 %>%
group_by(temp_cate, wind_cate, precip_cate, visib_cate) %>%
summarise(
avg_dep_delay = mean(dep_delay, na.rm = TRUE),
.groups = "drop"
)%>%
arrange(desc(avg_dep_delay))
summary_delay
## # A tibble: 24 x 5
## temp_cate wind_cate precip_cate visib_cate avg_dep_delay
## <chr> <chr> <chr> <chr> <dbl>
## 1 cold high_wind rainy/snowy clear 77
## 2 hot normal_wind rainy/snowy clear 64.6
## 3 cold high_wind rainy/snowy foggy 64.4
## 4 hot high_wind rainy/snowy clear 54.4
## 5 normal_temp high_wind rainy/snowy foggy 51.7
## 6 normal_temp high_wind no_precip foggy 50.5
## 7 cold normal_wind rainy/snowy foggy 37.2
## 8 cold normal_wind no_precip foggy 34.9
## 9 normal_temp normal_wind rainy/snowy foggy 33.3
## 10 normal_temp high_wind rainy/snowy clear 31.4
## # i 14 more rows
cold weather, high wind ,rainy or snowy ,no foggy will cause the departure delay easilier.
jun_13_2013 <- flights %>%
filter(year == 2013, month == 6, day == 13)
dest_delay <- jun_13_2013 %>%
group_by(dest) %>%
summarise(
avg_arr_delay = mean(arr_delay, na.rm = TRUE)
)
dest_delay
## # A tibble: 89 x 2
## dest avg_arr_delay
## <chr> <dbl>
## 1 ABQ 64
## 2 ACK 8
## 3 ALB -19
## 4 ATL 109.
## 5 AUS 64.3
## 6 AVL 90
## 7 BDL 18.5
## 8 BGR 102
## 9 BHM 158
## 10 BNA 95.9
## # i 79 more rows
map_data <- airports %>%
inner_join(dest_delay, by = c("faa" = "dest"))
map_data
## # A tibble: 85 x 9
## faa name lat lon alt tz dst tzone avg_arr_delay
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <chr> <chr> <dbl>
## 1 ABQ Albuquerque Interna~ 35.0 -107. 5355 -7 A Amer~ 64
## 2 ACK Nantucket Mem 41.3 -70.1 48 -5 A Amer~ 8
## 3 ALB Albany Intl 42.7 -73.8 285 -5 A Amer~ -19
## 4 ATL Hartsfield Jackson ~ 33.6 -84.4 1026 -5 A Amer~ 109.
## 5 AUS Austin Bergstrom In~ 30.2 -97.7 542 -6 A Amer~ 64.3
## 6 AVL Asheville Regional ~ 35.4 -82.5 2165 -5 A Amer~ 90
## 7 BDL Bradley Intl 41.9 -72.7 173 -5 A Amer~ 18.5
## 8 BGR Bangor Intl 44.8 -68.8 192 -5 A Amer~ 102
## 9 BHM Birmingham Intl 33.6 -86.8 644 -6 A Amer~ 158
## 10 BNA Nashville Intl 36.1 -86.7 599 -6 A Amer~ 95.9
## # i 75 more rows
ggplot(map_data, aes(x = lon, y = lat)) +
borders("state") +
geom_point(
aes(color = avg_arr_delay, size = avg_arr_delay),
alpha = 0.7
) +
scale_color_gradient(low = "blue", high = "red")
## Warning: Removed 3 rows containing missing values or values outside the scale range
## (`geom_point()`).
weather0613 <- weather %>%
filter(year == 2013, month == 6, day == 13)
weather0613
## # A tibble: 72 x 15
## origin year month day hour temp dewp humid wind_dir wind_speed
## <chr> <int> <int> <int> <int> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 EWR 2013 6 13 0 68 55.0 63.2 240 4.60
## 2 EWR 2013 6 13 1 68 54.0 60.8 0 0
## 3 EWR 2013 6 13 2 66.9 55.9 67.8 160 4.60
## 4 EWR 2013 6 13 3 64.9 57.0 75.5 220 5.75
## 5 EWR 2013 6 13 4 66.9 57.0 70.5 0 0
## 6 EWR 2013 6 13 5 66.9 57.0 70.5 220 4.60
## 7 EWR 2013 6 13 6 66.9 57.9 72.8 140 3.45
## 8 EWR 2013 6 13 7 66.9 57.9 72.8 120 6.90
## 9 EWR 2013 6 13 8 66.9 59 75.7 110 4.60
## 10 EWR 2013 6 13 9 66.2 61.0 83.8 NA 10.4
## # i 62 more rows
## # i 5 more variables: wind_gust <dbl>, precip <dbl>, pressure <dbl>,
## # visib <dbl>, time_hour <dttm>
weather_sum0613 <- weather0613 %>%
group_by(origin) %>%
summarise(
avg_temp = mean(temp, na.rm = TRUE),
avg_wind = mean(wind_speed, na.rm = TRUE),
total_precip = sum(precip, na.rm = TRUE),
avg_visib = mean(visib, na.rm = TRUE),
.groups = "drop"
)
weather_sum0613
## # A tibble: 3 x 5
## origin avg_temp avg_wind total_precip avg_visib
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 EWR 65.0 10.3 1.13 8.83
## 2 JFK 63.6 13.6 0.97 8.38
## 3 LGA 66.0 14.5 0.86 9.10