I, Linh Le, hereby state that I have not gained information in any way not allowed by the exam rules during this exam, and that all work is my own.
# load required packages here
library(tidyverse)
library(openintro)
library(ISLR2)
library(splines)
library(gam)
library(ggplot2)
library(gganimate)
library(gifski)
library(transformr)
library(nycflights13)
library(airports)
library(cherryblossom)
library(usdata)
library(sf)
library(dplyr)
library(tidyr)
library(scales)
library(RColorBrewer)
library(units)
library(cowplot)
The following questions shall be answered by working with the
world_bank_pop and who data sets from the
openinto library.
world_bank_pop is not clean. Clean the
data set such that the after data tidying you have six columns:
country, year, SP.URB.TOTL,
SP.URB.GROW, SP.POP.TOTL,
SP.POP.GROW. Give your code and show the first 10 rows of
the data set after being tidied. Then explain the meaning of each
column.# Enter code here.
glimpse(world_bank_pop)
## Rows: 1,064
## Columns: 20
## $ country <chr> "ABW", "ABW", "ABW", "ABW", "AFE", "AFE", "AFE", "AFE", "AFG…
## $ indicator <chr> "SP.URB.TOTL", "SP.URB.GROW", "SP.POP.TOTL", "SP.POP.GROW", …
## $ `2000` <dbl> 4.162500e+04, 1.664222e+00, 8.910100e+04, 2.539234e+00, 1.15…
## $ `2001` <dbl> 4.202500e+04, 9.563731e-01, 9.069100e+04, 1.768757e+00, 1.19…
## $ `2002` <dbl> 4.219400e+04, 4.013352e-01, 9.178100e+04, 1.194718e+00, 1.24…
## $ `2003` <dbl> 4.227700e+04, 1.965172e-01, 9.270100e+04, 9.973955e-01, 1.28…
## $ `2004` <dbl> 4.231700e+04, 9.456936e-02, 9.354000e+04, 9.009892e-01, 1.33…
## $ `2005` <dbl> 4.239900e+04, 1.935880e-01, 9.448300e+04, 1.003077e+00, 1.38…
## $ `2006` <dbl> 4.255500e+04, 3.672580e-01, 9.560600e+04, 1.181566e+00, 1.44…
## $ `2007` <dbl> 4.272900e+04, 4.080490e-01, 9.678700e+04, 1.227711e+00, 1.49…
## $ `2008` <dbl> 4.290600e+04, 4.133830e-01, 9.799600e+04, 1.241397e+00, 1.55…
## $ `2009` <dbl> 4.307900e+04, 4.023963e-01, 9.921200e+04, 1.233231e+00, 1.61…
## $ `2010` <dbl> 4.320600e+04, 2.943735e-01, 1.003410e+05, 1.131541e+00, 1.68…
## $ `2011` <dbl> 4.349300e+04, 6.620631e-01, 1.012880e+05, 9.393559e-01, 1.75…
## $ `2012` <dbl> 4.386400e+04, 8.493932e-01, 1.021120e+05, 8.102306e-01, 1.82…
## $ `2013` <dbl> 4.422800e+04, 8.264135e-01, 1.028800e+05, 7.493010e-01, 1.90…
## $ `2014` <dbl> 4.458800e+04, 8.106692e-01, 1.035940e+05, 6.916153e-01, 1.98…
## $ `2015` <dbl> 4.494300e+04, 7.930256e-01, 1.042570e+05, 6.379592e-01, 2.06…
## $ `2016` <dbl> 4.529700e+04, 7.845785e-01, 1.048740e+05, 5.900625e-01, 2.15…
## $ `2017` <dbl> 4.564800e+04, 7.718989e-01, 1.054390e+05, 5.372957e-01, 2.23…
wbp1<- world_bank_pop %>%
pivot_longer('2000':'2017', names_to = "year", values_to = "sp")%>%
print()
## # A tibble: 19,152 × 4
## country indicator year sp
## <chr> <chr> <chr> <dbl>
## 1 ABW SP.URB.TOTL 2000 41625
## 2 ABW SP.URB.TOTL 2001 42025
## 3 ABW SP.URB.TOTL 2002 42194
## 4 ABW SP.URB.TOTL 2003 42277
## 5 ABW SP.URB.TOTL 2004 42317
## 6 ABW SP.URB.TOTL 2005 42399
## 7 ABW SP.URB.TOTL 2006 42555
## 8 ABW SP.URB.TOTL 2007 42729
## 9 ABW SP.URB.TOTL 2008 42906
## 10 ABW SP.URB.TOTL 2009 43079
## # ℹ 19,142 more rows
wbp2<- wbp1%>%
pivot_wider(names_from = "indicator", values_from = "sp")%>%
print(n=10)
## # A tibble: 4,788 × 6
## country year SP.URB.TOTL SP.URB.GROW SP.POP.TOTL SP.POP.GROW
## <chr> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 ABW 2000 41625 1.66 89101 2.54
## 2 ABW 2001 42025 0.956 90691 1.77
## 3 ABW 2002 42194 0.401 91781 1.19
## 4 ABW 2003 42277 0.197 92701 0.997
## 5 ABW 2004 42317 0.0946 93540 0.901
## 6 ABW 2005 42399 0.194 94483 1.00
## 7 ABW 2006 42555 0.367 95606 1.18
## 8 ABW 2007 42729 0.408 96787 1.23
## 9 ABW 2008 42906 0.413 97996 1.24
## 10 ABW 2009 43079 0.402 99212 1.23
## # ℹ 4,778 more rows
Answer:
country: Three letter country code
year: Value for each year
SP.POP.GROW: population growth
SP.POP.TOTL: total population
SP.URB.GROW: urban population growth
SP.URB.TOTL: total urban population
country column of the tided data set in
step a) with full names of the country (for example, replace
USA with United States of America) by checking
the data frame who, which contains the full name of each
country corresponding to the three-digit country code. Give your code
and show the updated data set in a manner to illustrate that the task is
correctly fulfilled.# Enter code here.
glimpse(who)
## Rows: 7,240
## Columns: 60
## $ country <chr> "Afghanistan", "Afghanistan", "Afghanistan", "Afghanistan…
## $ iso2 <chr> "AF", "AF", "AF", "AF", "AF", "AF", "AF", "AF", "AF", "AF…
## $ iso3 <chr> "AFG", "AFG", "AFG", "AFG", "AFG", "AFG", "AFG", "AFG", "…
## $ year <dbl> 1980, 1981, 1982, 1983, 1984, 1985, 1986, 1987, 1988, 198…
## $ new_sp_m014 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ new_sp_m1524 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ new_sp_m2534 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ new_sp_m3544 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ new_sp_m4554 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ new_sp_m5564 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ new_sp_m65 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ new_sp_f014 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ new_sp_f1524 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ new_sp_f2534 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ new_sp_f3544 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ new_sp_f4554 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ new_sp_f5564 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ new_sp_f65 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ new_sn_m014 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ new_sn_m1524 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ new_sn_m2534 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ new_sn_m3544 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ new_sn_m4554 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ new_sn_m5564 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ new_sn_m65 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ new_sn_f014 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ new_sn_f1524 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ new_sn_f2534 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ new_sn_f3544 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ new_sn_f4554 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ new_sn_f5564 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ new_sn_f65 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ new_ep_m014 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ new_ep_m1524 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ new_ep_m2534 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ new_ep_m3544 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ new_ep_m4554 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ new_ep_m5564 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ new_ep_m65 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ new_ep_f014 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ new_ep_f1524 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ new_ep_f2534 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ new_ep_f3544 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ new_ep_f4554 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ new_ep_f5564 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ new_ep_f65 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ newrel_m014 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ newrel_m1524 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ newrel_m2534 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ newrel_m3544 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ newrel_m4554 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ newrel_m5564 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ newrel_m65 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ newrel_f014 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ newrel_f1524 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ newrel_f2534 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ newrel_f3544 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ newrel_f4554 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ newrel_f5564 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ newrel_f65 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
glimpse(wbp2)
## Rows: 4,788
## Columns: 6
## $ country <chr> "ABW", "ABW", "ABW", "ABW", "ABW", "ABW", "ABW", "ABW", "A…
## $ year <chr> "2000", "2001", "2002", "2003", "2004", "2005", "2006", "2…
## $ SP.URB.TOTL <dbl> 41625, 42025, 42194, 42277, 42317, 42399, 42555, 42729, 42…
## $ SP.URB.GROW <dbl> 1.66422212, 0.95637310, 0.40133515, 0.19651721, 0.09456936…
## $ SP.POP.TOTL <dbl> 89101, 90691, 91781, 92701, 93540, 94483, 95606, 96787, 97…
## $ SP.POP.GROW <dbl> 2.5392344, 1.7687566, 1.1947181, 0.9973955, 0.9009892, 1.0…
wbp3<-wbp2 %>%
left_join(who, c("country" = "iso3"),relationship = "many-to-many")%>%
dplyr :: select('country.y', 'year.x', 'SP.POP.GROW','SP.POP.TOTL','SP.URB.GROW','SP.URB.TOTL')%>%
rename(country= country.y, year = year.x)%>%
print()
## # A tibble: 126,594 × 6
## country year SP.POP.GROW SP.POP.TOTL SP.URB.GROW SP.URB.TOTL
## <chr> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 Aruba 2000 2.54 89101 1.66 41625
## 2 Aruba 2000 2.54 89101 1.66 41625
## 3 Aruba 2000 2.54 89101 1.66 41625
## 4 Aruba 2000 2.54 89101 1.66 41625
## 5 Aruba 2000 2.54 89101 1.66 41625
## 6 Aruba 2000 2.54 89101 1.66 41625
## 7 Aruba 2000 2.54 89101 1.66 41625
## 8 Aruba 2000 2.54 89101 1.66 41625
## 9 Aruba 2000 2.54 89101 1.66 41625
## 10 Aruba 2000 2.54 89101 1.66 41625
## # ℹ 126,584 more rows
# Enter code here.
wbp4<-wbp3%>%
group_by(country)%>%
summarise(avg_urb_growth_from_2000_2017 = mean(SP.URB.GROW))%>%
arrange(desc(avg_urb_growth_from_2000_2017))%>%
top_n(40, avg_urb_growth_from_2000_2017) %>%
print(n=10)
## # A tibble: 40 × 2
## country avg_urb_growth_from_2000_2017
## <chr> <dbl>
## 1 Qatar 8.43
## 2 Equatorial Guinea 6.72
## 3 United Arab Emirates 6.40
## 4 Burkina Faso 5.88
## 5 Burundi 5.83
## 6 Uganda 5.68
## 7 Mali 5.44
## 8 United Republic of Tanzania 5.16
## 9 Angola 5.14
## 10 Maldives 5.01
## # ℹ 30 more rows
wbp4%>%
filter(avg_urb_growth_from_2000_2017>=5)
ggplot(wbp4)+
stat_summary(mapping = aes(x = country, y = avg_urb_growth_from_2000_2017), geom = "bar")+
coord_flip()+
labs(title = "Average urban growth from 2000-2017 accross different country", x = "Country", y = "Average urban growth from 2000-2017")+
theme(plot.title = element_text(hjust = 0.5))
Answer:
Based on the graph, most of the country has the average urban population growth between 2000 anf 2017 below 5%
Only Qatar, Equatorial Guinea, United Arab Emirates, Burkina Faso, Burundi, Uganda, Mali, United Republic of Tanzania Angola, Maldives have the average urban population growth between 2000 and 2017 above 5%
Qatar, Equatorial Guinea and United Arab Emirates
are the top countries that had undergone significant urbanization
between 2000 and 2017 because their average growth are much higher
compare to the others country
For the following tasks, use data set planes and
flights from the nycflights13 package.
planes data set, only keep planes from
manufacturers that have more than 10 samples in the data set. Then
convert manufacturer column into a factor. Then combine
AIRBUS and AIRBUS INDUSTRIE as a single
category AIRBUS; combine MCDONNELL DOUGLAS,
MCDONNELL DOUGLAS AIRCRAFT CO and
MCDONNELL DOUGLAS CORPORATION into a single category
MCDONNELL. Save your data frame as a new one. Show your
code and the first 10 rows of the updated data frame.# Enter code here.
pm<- planes%>%
group_by(manufacturer)%>%
summarise(n= n())%>%
filter(n>10)
p1<- planes%>%
inner_join(pm,by="manufacturer")
p2<-p1$manufacturer<-factor(p1$manufacturer)
print(p1)
## # A tibble: 3,270 × 10
## tailnum year type manufacturer model engines seats speed engine n
## <chr> <int> <chr> <fct> <chr> <int> <int> <int> <chr> <int>
## 1 N10156 2004 Fixed wing… EMBRAER EMB-… 2 55 NA Turbo… 299
## 2 N102UW 1998 Fixed wing… AIRBUS INDU… A320… 2 182 NA Turbo… 400
## 3 N103US 1999 Fixed wing… AIRBUS INDU… A320… 2 182 NA Turbo… 400
## 4 N104UW 1999 Fixed wing… AIRBUS INDU… A320… 2 182 NA Turbo… 400
## 5 N10575 2002 Fixed wing… EMBRAER EMB-… 2 55 NA Turbo… 299
## 6 N105UW 1999 Fixed wing… AIRBUS INDU… A320… 2 182 NA Turbo… 400
## 7 N107US 1999 Fixed wing… AIRBUS INDU… A320… 2 182 NA Turbo… 400
## 8 N108UW 1999 Fixed wing… AIRBUS INDU… A320… 2 182 NA Turbo… 400
## 9 N109UW 1999 Fixed wing… AIRBUS INDU… A320… 2 182 NA Turbo… 400
## 10 N110UW 1999 Fixed wing… AIRBUS INDU… A320… 2 182 NA Turbo… 400
## # ℹ 3,260 more rows
p3<-p1 %>%
mutate(manufacturer = fct_collapse(manufacturer,
"AIRBUS" = c("AIRBUS", "AIRBUS INDUSTRIE"),
"MCDONNELL" = c("MCDONNELL DOUGLAS", "MCDONNELL DOUGLAS AIRCRAFT CO","MCDONNELL DOUGLAS CORPORATION")
))%>%
print(n=10)
## # A tibble: 3,270 × 10
## tailnum year type manufacturer model engines seats speed engine n
## <chr> <int> <chr> <fct> <chr> <int> <int> <int> <chr> <int>
## 1 N10156 2004 Fixed wing… EMBRAER EMB-… 2 55 NA Turbo… 299
## 2 N102UW 1998 Fixed wing… AIRBUS A320… 2 182 NA Turbo… 400
## 3 N103US 1999 Fixed wing… AIRBUS A320… 2 182 NA Turbo… 400
## 4 N104UW 1999 Fixed wing… AIRBUS A320… 2 182 NA Turbo… 400
## 5 N10575 2002 Fixed wing… EMBRAER EMB-… 2 55 NA Turbo… 299
## 6 N105UW 1999 Fixed wing… AIRBUS A320… 2 182 NA Turbo… 400
## 7 N107US 1999 Fixed wing… AIRBUS A320… 2 182 NA Turbo… 400
## 8 N108UW 1999 Fixed wing… AIRBUS A320… 2 182 NA Turbo… 400
## 9 N109UW 1999 Fixed wing… AIRBUS A320… 2 182 NA Turbo… 400
## 10 N110UW 1999 Fixed wing… AIRBUS A320… 2 182 NA Turbo… 400
## # ℹ 3,260 more rows
flights data set with the planes
data set, study how plane models correlate with the flight distance with
proper data visualizations or summary tables. You are required to
summarize your findings concisely in your own words.# Enter code here.
glimpse(planes)
## Rows: 3,322
## Columns: 9
## $ tailnum <chr> "N10156", "N102UW", "N103US", "N104UW", "N10575", "N105UW…
## $ year <int> 2004, 1998, 1999, 1999, 2002, 1999, 1999, 1999, 1999, 199…
## $ type <chr> "Fixed wing multi engine", "Fixed wing multi engine", "Fi…
## $ manufacturer <chr> "EMBRAER", "AIRBUS INDUSTRIE", "AIRBUS INDUSTRIE", "AIRBU…
## $ model <chr> "EMB-145XR", "A320-214", "A320-214", "A320-214", "EMB-145…
## $ engines <int> 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, …
## $ seats <int> 55, 182, 182, 182, 55, 182, 182, 182, 182, 182, 55, 55, 5…
## $ speed <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ engine <chr> "Turbo-fan", "Turbo-fan", "Turbo-fan", "Turbo-fan", "Turb…
glimpse(flights)
## Rows: 336,776
## Columns: 19
## $ year <int> 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2…
## $ month <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ day <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ dep_time <int> 517, 533, 542, 544, 554, 554, 555, 557, 557, 558, 558, …
## $ sched_dep_time <int> 515, 529, 540, 545, 600, 558, 600, 600, 600, 600, 600, …
## $ dep_delay <dbl> 2, 4, 2, -1, -6, -4, -5, -3, -3, -2, -2, -2, -2, -2, -1…
## $ arr_time <int> 830, 850, 923, 1004, 812, 740, 913, 709, 838, 753, 849,…
## $ sched_arr_time <int> 819, 830, 850, 1022, 837, 728, 854, 723, 846, 745, 851,…
## $ arr_delay <dbl> 11, 20, 33, -18, -25, 12, 19, -14, -8, 8, -2, -3, 7, -1…
## $ carrier <chr> "UA", "UA", "AA", "B6", "DL", "UA", "B6", "EV", "B6", "…
## $ flight <int> 1545, 1714, 1141, 725, 461, 1696, 507, 5708, 79, 301, 4…
## $ tailnum <chr> "N14228", "N24211", "N619AA", "N804JB", "N668DN", "N394…
## $ origin <chr> "EWR", "LGA", "JFK", "JFK", "LGA", "EWR", "EWR", "LGA",…
## $ dest <chr> "IAH", "IAH", "MIA", "BQN", "ATL", "ORD", "FLL", "IAD",…
## $ air_time <dbl> 227, 227, 160, 183, 116, 150, 158, 53, 140, 138, 149, 1…
## $ distance <dbl> 1400, 1416, 1089, 1576, 762, 719, 1065, 229, 944, 733, …
## $ hour <dbl> 5, 5, 5, 5, 6, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 6, 6, 6…
## $ minute <dbl> 15, 29, 40, 45, 0, 58, 0, 0, 0, 0, 0, 0, 0, 0, 0, 59, 0…
## $ time_hour <dttm> 2013-01-01 05:00:00, 2013-01-01 05:00:00, 2013-01-01 0…
ft<-flights%>%
left_join(planes, by="tailnum")%>%
group_by(model)%>%
summarise(avg_dis = mean(distance))%>%
arrange(desc(avg_dis))%>%
top_n(40, avg_dis) %>%
print()
## # A tibble: 40 × 2
## model avg_dis
## <chr> <dbl>
## 1 A330-243 4983
## 2 767-424ER 3850.
## 3 A319-115 2525.
## 4 777-222 2520
## 5 757-212 2475
## 6 A330-323 2422
## 7 737-890 2402
## 8 737-8FH 2402
## 9 737-990 2402
## 10 737-990ER 2402
## # ℹ 30 more rows
ggplot(ft)+
stat_summary(mapping = aes(x = model, y = avg_dis), geom = "bar")+
coord_flip()+
labs(title = "Average distance accross different model", x = "Model", y = "Average Distance (miles)")+
theme(plot.title = element_text(hjust = 0.5))
Answer:
For the top 40 model with the longest travel distance, ost of the model can fly with the average distance from 1500-2500 miles.
There are afew of them can fly further which is :
777-222: 2520 miles
A319-115: 2525
miles
767-424ER: 3850 miles
A330-243: 4983 miles
For the following tasks, use the data set weather,
flights or planes from the
nycflights13 package.
JFK airport. (Hint: You need to first create a
datetime variable for each hour.)# Enter code here.
ftemp<-flights %>%
dplyr::select(year, month, day, hour, minute,tailnum, origin) %>%
mutate(departure_date = make_date(year, month, day)) %>%
mutate(departure_scheduled = make_datetime(year, month, day, hour, minute, tz = Sys.timezone()))%>%
filter(year ==2013, origin == "JFK")%>%
left_join(weather)%>%
filter(!is.na(temp))%>%
group_by(month)%>%
summarise(avg_temp = mean(temp))%>%
print()
## # A tibble: 12 × 2
## month avg_temp
## <int> <dbl>
## 1 1 36.3
## 2 2 35.2
## 3 3 40.8
## 4 4 51.9
## 5 5 61.1
## 6 6 71.6
## 7 7 80.2
## 8 8 75.3
## 9 9 68.8
## 10 10 61.2
## 11 11 46.0
## 12 12 39.5
ggplot(ftemp)+
stat_summary(mapping = aes(x = as.factor(month), y = avg_temp), geom = "bar")+
labs(title = "Temperature change across the whole year of 2013 at the `JFK` airport", x = "Month", y = "Temperature in F")+
theme(plot.title = element_text(hjust = 0.5))
# Enter code here.
ftemp1<-flights %>%
dplyr::select(year, month, day, hour, minute,tailnum, origin) %>%
mutate(departure_date = make_date(year, month, day)) %>%
mutate(departure_scheduled = make_datetime(year, month, day, hour, minute, tz = Sys.timezone()))%>%
filter(year ==2013, origin == "JFK")%>%
left_join(weather)%>%
filter(!is.na(temp))%>%
group_by(month,day)%>%
summarise(maxtemp= max(temp), mintemp=min(temp), d =maxtemp-mintemp)%>%
arrange(desc(d))%>%
print()
## # A tibble: 364 × 5
## # Groups: month [12]
## month day maxtemp mintemp d
## <int> <int> <dbl> <dbl> <dbl>
## 1 5 8 62.6 13.1 49.5
## 2 4 9 82.9 48.0 34.9
## 3 9 24 73.9 48.0 25.9
## 4 1 31 55.4 30.0 25.4
## 5 11 27 60.8 36.0 24.8
## 6 1 20 55.0 30.9 24.1
## 7 5 24 69.8 46.0 23.8
## 8 4 1 62.1 39.0 23.0
## 9 4 5 63.0 39.9 23.0
## 10 10 2 84.0 61.0 23.0
## # ℹ 354 more rows
Answer: May 8th has the largest different
flights data set. Here overnight
flights are defined as flights that departed between 10pm and 1am, and
having an air time of over 4 hours . Create a categorical variable
overnight_flag with YES or NO as
the possible values. Show your code and the updated data frame.# Enter code here.
ovnight<- flights %>%
dplyr::select(year, month, day, hour, minute,tailnum, origin,air_time) %>%
mutate(departure_date = make_date(year, month, day)) %>%
mutate(departure_scheduled = make_datetime(year, month, day, hour, minute, tz = Sys.timezone()))%>%
filter(!is.na(air_time))%>%
mutate(overnight_flag = ifelse(
hour %in% c(22, 23, 0, 1)& air_time> 4, "YES","NO"))
glimpse(ovnight)
## Rows: 327,346
## Columns: 11
## $ year <int> 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 20…
## $ month <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ day <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ hour <dbl> 5, 5, 5, 5, 6, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 6,…
## $ minute <dbl> 15, 29, 40, 45, 0, 58, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tailnum <chr> "N14228", "N24211", "N619AA", "N804JB", "N668DN", …
## $ origin <chr> "EWR", "LGA", "JFK", "JFK", "LGA", "EWR", "EWR", "…
## $ air_time <dbl> 227, 227, 160, 183, 116, 150, 158, 53, 140, 138, 1…
## $ departure_date <date> 2013-01-01, 2013-01-01, 2013-01-01, 2013-01-01, 2…
## $ departure_scheduled <dttm> 2013-01-01 05:15:00, 2013-01-01 05:29:00, 2013-01…
## $ overnight_flag <chr> "NO", "NO", "NO", "NO", "NO", "NO", "NO", "NO", "N…
Answer:
planes data set.# Enter code here.
ovnight1<-ovnight%>%
left_join(planes,by="tailnum")%>%
filter(!is.na(seats))%>%
group_by(overnight_flag)%>%
summarise(avg_seats=mean(seats))%>%
print()
## # A tibble: 2 × 2
## overnight_flag avg_seats
## <chr> <dbl>
## 1 NO 138.
## 2 YES 106.
Answer: This is true because on average overnight
flights use relatively small plane
Answer the following questions with data visualization or summary. You are required to summarize your findings concisely in your own words and support your conclusion with proper graphs or tables.
gss_cat data set, find factors that are
significantly correlated with the reported income.# Enter code here.
glimpse(gss_cat)
## Rows: 21,483
## Columns: 9
## $ year <int> 2000, 2000, 2000, 2000, 2000, 2000, 2000, 2000, 2000, 2000, 20…
## $ marital <fct> Never married, Divorced, Widowed, Never married, Divorced, Mar…
## $ age <int> 26, 48, 67, 39, 25, 25, 36, 44, 44, 47, 53, 52, 52, 51, 52, 40…
## $ race <fct> White, White, White, White, White, White, White, White, White,…
## $ rincome <fct> $8000 to 9999, $8000 to 9999, Not applicable, Not applicable, …
## $ partyid <fct> "Ind,near rep", "Not str republican", "Independent", "Ind,near…
## $ relig <fct> Protestant, Protestant, Protestant, Orthodox-christian, None, …
## $ denom <fct> "Southern baptist", "Baptist-dk which", "No denomination", "No…
## $ tvhours <int> 12, NA, 2, 4, 1, NA, 3, NA, 0, 3, 2, NA, 1, NA, 1, 7, NA, 3, 3…
gss_cat1<-gss_cat%>%
filter(!is.na(rincome))
#age
oneway.test(gss_cat1$age ~ gss_cat1$rincome)
##
## One-way analysis of means (not assuming equal variances)
##
## data: gss_cat1$age and gss_cat1$rincome
## F = 190.67, num df = 15.0, denom df = 1934.6, p-value < 2.2e-16
#year
oneway.test(gss_cat1$year ~ gss_cat1$rincome)
##
## One-way analysis of means (not assuming equal variances)
##
## data: gss_cat1$year and gss_cat1$rincome
## F = 14.516, num df = 15.0, denom df = 1972.8, p-value < 2.2e-16
#marital
chisq.test(table(gss_cat1$marital, gss_cat1$rincome))
## Warning in chisq.test(table(gss_cat1$marital, gss_cat1$rincome)): Chi-squared
## approximation may be incorrect
##
## Pearson's Chi-squared test
##
## data: table(gss_cat1$marital, gss_cat1$rincome)
## X-squared = 2470.6, df = 75, p-value < 2.2e-16
#race
gss_cat4<- gss_cat1%>%
filter(!is.na(race))
chisq.test(table(gss_cat4$race,gss_cat4$rincome))
## Warning in chisq.test(table(gss_cat4$race, gss_cat4$rincome)): Chi-squared
## approximation may be incorrect
##
## Pearson's Chi-squared test
##
## data: table(gss_cat4$race, gss_cat4$rincome)
## X-squared = NaN, df = 45, p-value = NA
#partyid
chisq.test(table(gss_cat1$partyid, gss_cat1$rincome))
## Warning in chisq.test(table(gss_cat1$partyid, gss_cat1$rincome)): Chi-squared
## approximation may be incorrect
##
## Pearson's Chi-squared test
##
## data: table(gss_cat1$partyid, gss_cat1$rincome)
## X-squared = 806.11, df = 135, p-value < 2.2e-16
#relig
gss_cat3<- gss_cat1%>%
filter(!is.na(relig))
chisq.test(table(gss_cat3$relig, gss_cat3$rincome))
## Warning in chisq.test(table(gss_cat3$relig, gss_cat3$rincome)): Chi-squared
## approximation may be incorrect
##
## Pearson's Chi-squared test
##
## data: table(gss_cat3$relig, gss_cat3$rincome)
## X-squared = NaN, df = 225, p-value = NA
#denom
chisq.test(table(gss_cat1$denom, gss_cat1$rincome))
## Warning in chisq.test(table(gss_cat1$denom, gss_cat1$rincome)): Chi-squared
## approximation may be incorrect
##
## Pearson's Chi-squared test
##
## data: table(gss_cat1$denom, gss_cat1$rincome)
## X-squared = 951.2, df = 435, p-value < 2.2e-16
#tvhours
gss_cat2<- gss_cat1%>%
filter(!is.na(tvhours))
oneway.test(gss_cat2$tvhours ~ gss_cat2$rincome)
##
## One-way analysis of means (not assuming equal variances)
##
## data: gss_cat2$tvhours and gss_cat2$rincome
## F = 53.509, num df = 15, denom df = 1004, p-value < 2.2e-16
Answer: (please dont count this answer) Since the p-value is very small, year of survey, age, marital status, party affiliation, denomination, and tvhours are significantly correlated with the reported income.
smoking data set of the openintro
package, find factors that are significantly correlated with the smoking
status and the number of cigarettes smoked per day.# Enter code here.
glimpse(smoking)
## Rows: 1,691
## Columns: 12
## $ gender <fct> Male, Female, Male, Female, Female, Female, Male…
## $ age <int> 38, 42, 40, 40, 39, 37, 53, 44, 40, 41, 72, 49, …
## $ marital_status <fct> Divorced, Single, Married, Married, Married, Mar…
## $ highest_qualification <fct> No Qualification, No Qualification, Degree, Degr…
## $ nationality <fct> British, British, English, English, British, Bri…
## $ ethnicity <fct> White, White, White, White, White, White, White,…
## $ gross_income <fct> "2,600 to 5,200", "Under 2,600", "28,600 to 36,4…
## $ region <fct> The North, The North, The North, The North, The …
## $ smoke <fct> No, Yes, No, No, No, No, Yes, No, Yes, Yes, No, …
## $ amt_weekends <int> NA, 12, NA, NA, NA, NA, 6, NA, 8, 15, NA, NA, NA…
## $ amt_weekdays <int> NA, 12, NA, NA, NA, NA, 6, NA, 8, 12, NA, NA, NA…
## $ type <fct> , Packets, , , , , Packets, , Hand-Rolled, Packe…
sm<-smoking%>%
filter(!is.na(smoke)|!is.na(ethnicity)|!is.na(amt_weekends)&!is.na(amt_weekdays))%>%
mutate(amt_day = (amt_weekends+amt_weekdays)/2)%>%
print()
## # A tibble: 1,691 × 13
## gender age marital_status highest_qualification nationality ethnicity
## <fct> <int> <fct> <fct> <fct> <fct>
## 1 Male 38 Divorced No Qualification British White
## 2 Female 42 Single No Qualification British White
## 3 Male 40 Married Degree English White
## 4 Female 40 Married Degree English White
## 5 Female 39 Married GCSE/O Level British White
## 6 Female 37 Married GCSE/O Level British White
## 7 Male 53 Married Degree British White
## 8 Male 44 Single Degree English White
## 9 Male 40 Single GCSE/CSE English White
## 10 Female 41 Married No Qualification English White
## # ℹ 1,681 more rows
## # ℹ 7 more variables: gross_income <fct>, region <fct>, smoke <fct>,
## # amt_weekends <int>, amt_weekdays <int>, type <fct>, amt_day <dbl>
#highest_qualification
chisq.test(table(sm$highest_qualification, sm$smoke))
##
## Pearson's Chi-squared test
##
## data: table(sm$highest_qualification, sm$smoke)
## X-squared = 40.281, df = 7, p-value = 1.112e-06
oneway.test(sm$amt_day~ sm$highest_qualification)
##
## One-way analysis of means (not assuming equal variances)
##
## data: sm$amt_day and sm$highest_qualification
## F = 2.2631, num df = 7.00, denom df = 104.83, p-value = 0.03476
#region
chisq.test(table(sm$region, sm$smoke))
##
## Pearson's Chi-squared test
##
## data: table(sm$region, sm$smoke)
## X-squared = 12.671, df = 6, p-value = 0.04857
oneway.test(sm$amt_day~ sm$region)
##
## One-way analysis of means (not assuming equal variances)
##
## data: sm$amt_day and sm$region
## F = 2.2964, num df = 6.00, denom df = 129.55, p-value = 0.03861
#gender
chisq.test(table(sm$gender, sm$smoke))
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: table(sm$gender, sm$smoke)
## X-squared = 0.42699, df = 1, p-value = 0.5135
oneway.test(sm$amt_day~ sm$gender)
##
## One-way analysis of means (not assuming equal variances)
##
## data: sm$amt_day and sm$gender
## F = 14.816, num df = 1.00, denom df = 325.74, p-value = 0.0001427
#age
oneway.test(sm$age~ sm$smoke)
##
## One-way analysis of means (not assuming equal variances)
##
## data: sm$age and sm$smoke
## F = 99.444, num df = 1.0, denom df = 831.2, p-value < 2.2e-16
cor(sm$age, sm$amt_day)
## [1] NA
#marital_status
chisq.test(table(sm$marital_status, sm$smoke))
##
## Pearson's Chi-squared test
##
## data: table(sm$marital_status, sm$smoke)
## X-squared = 74.98, df = 4, p-value = 2.012e-15
oneway.test(sm$amt_day~ sm$marital_status)
##
## One-way analysis of means (not assuming equal variances)
##
## data: sm$amt_day and sm$marital_status
## F = 0.74965, num df = 4.000, denom df = 98.133, p-value = 0.5606
#nationality
chisq.test(table(sm$nationality, sm$smoke))
## Warning in chisq.test(table(sm$nationality, sm$smoke)): Chi-squared
## approximation may be incorrect
##
## Pearson's Chi-squared test
##
## data: table(sm$nationality, sm$smoke)
## X-squared = 14.929, df = 7, p-value = 0.03692
#oneway.test(sm$amt_day~ sm$nationality) not enough observation
#ethnicity
chisq.test(table(sm$ethnicity, sm$smoke))
## Warning in chisq.test(table(sm$ethnicity, sm$smoke)): Chi-squared approximation
## may be incorrect
##
## Pearson's Chi-squared test
##
## data: table(sm$ethnicity, sm$smoke)
## X-squared = 3.0561, df = 6, p-value = 0.8018
#oneway.test(sm$amt_day~ sm$ethnicity)not enough observation
#gross_income
chisq.test(table(sm$gross_income, sm$smoke))
## Warning in chisq.test(table(sm$gross_income, sm$smoke)): Chi-squared
## approximation may be incorrect
##
## Pearson's Chi-squared test
##
## data: table(sm$gross_income, sm$smoke)
## X-squared = 19.835, df = 9, p-value = 0.01896
oneway.test(sm$amt_day~ sm$gross_income)
##
## One-way analysis of means (not assuming equal variances)
##
## data: sm$amt_day and sm$gross_income
## F = 1.1923, num df = 9.000, denom df = 40.541, p-value = 0.3261
#type
chisq.test(table(sm$type, sm$smoke))
## Warning in chisq.test(table(sm$type, sm$smoke)): Chi-squared approximation may
## be incorrect
##
## Pearson's Chi-squared test
##
## data: table(sm$type, sm$smoke)
## X-squared = 1691, df = 4, p-value < 2.2e-16
oneway.test(sm$amt_day~ sm$type)
##
## One-way analysis of means (not assuming equal variances)
##
## data: sm$amt_day and sm$type
## F = 2.018, num df = 3.000, denom df = 36.822, p-value = 0.1283
Answer: (Please dont count this answer)
Highest education level and Region
significantly correlated with the smoking status and
the number of cigarettes smoked per day