Academic Honesty Statement

I, Linh Le, hereby state that I have not gained information in any way not allowed by the exam rules during this exam, and that all work is my own.

Load packages

# load required packages here
library(tidyverse)
library(openintro)
library(ISLR2)
library(splines)
library(gam)
library(ggplot2)
library(gganimate)
library(gifski)
library(transformr)
library(nycflights13)
library(airports)
library(cherryblossom)
library(usdata)
library(sf)           
library(dplyr)        
library(tidyr)        
library(scales)       
library(RColorBrewer) 
library(units)
library(cowplot)

1. Data Tidying and Relational Data

The following questions shall be answered by working with the world_bank_pop and who data sets from the openinto library.

  1. The data set world_bank_pop is not clean. Clean the data set such that the after data tidying you have six columns: country, year, SP.URB.TOTL, SP.URB.GROW, SP.POP.TOTL, SP.POP.GROW. Give your code and show the first 10 rows of the data set after being tidied. Then explain the meaning of each column.
# Enter code here.

glimpse(world_bank_pop)
## Rows: 1,064
## Columns: 20
## $ country   <chr> "ABW", "ABW", "ABW", "ABW", "AFE", "AFE", "AFE", "AFE", "AFG…
## $ indicator <chr> "SP.URB.TOTL", "SP.URB.GROW", "SP.POP.TOTL", "SP.POP.GROW", …
## $ `2000`    <dbl> 4.162500e+04, 1.664222e+00, 8.910100e+04, 2.539234e+00, 1.15…
## $ `2001`    <dbl> 4.202500e+04, 9.563731e-01, 9.069100e+04, 1.768757e+00, 1.19…
## $ `2002`    <dbl> 4.219400e+04, 4.013352e-01, 9.178100e+04, 1.194718e+00, 1.24…
## $ `2003`    <dbl> 4.227700e+04, 1.965172e-01, 9.270100e+04, 9.973955e-01, 1.28…
## $ `2004`    <dbl> 4.231700e+04, 9.456936e-02, 9.354000e+04, 9.009892e-01, 1.33…
## $ `2005`    <dbl> 4.239900e+04, 1.935880e-01, 9.448300e+04, 1.003077e+00, 1.38…
## $ `2006`    <dbl> 4.255500e+04, 3.672580e-01, 9.560600e+04, 1.181566e+00, 1.44…
## $ `2007`    <dbl> 4.272900e+04, 4.080490e-01, 9.678700e+04, 1.227711e+00, 1.49…
## $ `2008`    <dbl> 4.290600e+04, 4.133830e-01, 9.799600e+04, 1.241397e+00, 1.55…
## $ `2009`    <dbl> 4.307900e+04, 4.023963e-01, 9.921200e+04, 1.233231e+00, 1.61…
## $ `2010`    <dbl> 4.320600e+04, 2.943735e-01, 1.003410e+05, 1.131541e+00, 1.68…
## $ `2011`    <dbl> 4.349300e+04, 6.620631e-01, 1.012880e+05, 9.393559e-01, 1.75…
## $ `2012`    <dbl> 4.386400e+04, 8.493932e-01, 1.021120e+05, 8.102306e-01, 1.82…
## $ `2013`    <dbl> 4.422800e+04, 8.264135e-01, 1.028800e+05, 7.493010e-01, 1.90…
## $ `2014`    <dbl> 4.458800e+04, 8.106692e-01, 1.035940e+05, 6.916153e-01, 1.98…
## $ `2015`    <dbl> 4.494300e+04, 7.930256e-01, 1.042570e+05, 6.379592e-01, 2.06…
## $ `2016`    <dbl> 4.529700e+04, 7.845785e-01, 1.048740e+05, 5.900625e-01, 2.15…
## $ `2017`    <dbl> 4.564800e+04, 7.718989e-01, 1.054390e+05, 5.372957e-01, 2.23…
wbp1<- world_bank_pop %>%
  pivot_longer('2000':'2017', names_to = "year", values_to = "sp")%>%
  print()
## # A tibble: 19,152 × 4
##    country indicator   year     sp
##    <chr>   <chr>       <chr> <dbl>
##  1 ABW     SP.URB.TOTL 2000  41625
##  2 ABW     SP.URB.TOTL 2001  42025
##  3 ABW     SP.URB.TOTL 2002  42194
##  4 ABW     SP.URB.TOTL 2003  42277
##  5 ABW     SP.URB.TOTL 2004  42317
##  6 ABW     SP.URB.TOTL 2005  42399
##  7 ABW     SP.URB.TOTL 2006  42555
##  8 ABW     SP.URB.TOTL 2007  42729
##  9 ABW     SP.URB.TOTL 2008  42906
## 10 ABW     SP.URB.TOTL 2009  43079
## # ℹ 19,142 more rows
wbp2<- wbp1%>%
  pivot_wider(names_from = "indicator", values_from = "sp")%>%
  print(n=10)
## # A tibble: 4,788 × 6
##    country year  SP.URB.TOTL SP.URB.GROW SP.POP.TOTL SP.POP.GROW
##    <chr>   <chr>       <dbl>       <dbl>       <dbl>       <dbl>
##  1 ABW     2000        41625      1.66         89101       2.54 
##  2 ABW     2001        42025      0.956        90691       1.77 
##  3 ABW     2002        42194      0.401        91781       1.19 
##  4 ABW     2003        42277      0.197        92701       0.997
##  5 ABW     2004        42317      0.0946       93540       0.901
##  6 ABW     2005        42399      0.194        94483       1.00 
##  7 ABW     2006        42555      0.367        95606       1.18 
##  8 ABW     2007        42729      0.408        96787       1.23 
##  9 ABW     2008        42906      0.413        97996       1.24 
## 10 ABW     2009        43079      0.402        99212       1.23 
## # ℹ 4,778 more rows

Answer:

country: Three letter country code
year: Value for each year
SP.POP.GROW: population growth
SP.POP.TOTL: total population
SP.URB.GROW: urban population growth
SP.URB.TOTL: total urban population

  1. Replace the country column of the tided data set in step a) with full names of the country (for example, replace USA with United States of America) by checking the data frame who, which contains the full name of each country corresponding to the three-digit country code. Give your code and show the updated data set in a manner to illustrate that the task is correctly fulfilled.
# Enter code here.
glimpse(who)
## Rows: 7,240
## Columns: 60
## $ country      <chr> "Afghanistan", "Afghanistan", "Afghanistan", "Afghanistan…
## $ iso2         <chr> "AF", "AF", "AF", "AF", "AF", "AF", "AF", "AF", "AF", "AF…
## $ iso3         <chr> "AFG", "AFG", "AFG", "AFG", "AFG", "AFG", "AFG", "AFG", "…
## $ year         <dbl> 1980, 1981, 1982, 1983, 1984, 1985, 1986, 1987, 1988, 198…
## $ new_sp_m014  <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ new_sp_m1524 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ new_sp_m2534 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ new_sp_m3544 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ new_sp_m4554 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ new_sp_m5564 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ new_sp_m65   <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ new_sp_f014  <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ new_sp_f1524 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ new_sp_f2534 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ new_sp_f3544 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ new_sp_f4554 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ new_sp_f5564 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ new_sp_f65   <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ new_sn_m014  <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ new_sn_m1524 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ new_sn_m2534 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ new_sn_m3544 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ new_sn_m4554 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ new_sn_m5564 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ new_sn_m65   <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ new_sn_f014  <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ new_sn_f1524 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ new_sn_f2534 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ new_sn_f3544 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ new_sn_f4554 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ new_sn_f5564 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ new_sn_f65   <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ new_ep_m014  <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ new_ep_m1524 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ new_ep_m2534 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ new_ep_m3544 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ new_ep_m4554 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ new_ep_m5564 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ new_ep_m65   <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ new_ep_f014  <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ new_ep_f1524 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ new_ep_f2534 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ new_ep_f3544 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ new_ep_f4554 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ new_ep_f5564 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ new_ep_f65   <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ newrel_m014  <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ newrel_m1524 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ newrel_m2534 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ newrel_m3544 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ newrel_m4554 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ newrel_m5564 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ newrel_m65   <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ newrel_f014  <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ newrel_f1524 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ newrel_f2534 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ newrel_f3544 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ newrel_f4554 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ newrel_f5564 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ newrel_f65   <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
glimpse(wbp2)
## Rows: 4,788
## Columns: 6
## $ country     <chr> "ABW", "ABW", "ABW", "ABW", "ABW", "ABW", "ABW", "ABW", "A…
## $ year        <chr> "2000", "2001", "2002", "2003", "2004", "2005", "2006", "2…
## $ SP.URB.TOTL <dbl> 41625, 42025, 42194, 42277, 42317, 42399, 42555, 42729, 42…
## $ SP.URB.GROW <dbl> 1.66422212, 0.95637310, 0.40133515, 0.19651721, 0.09456936…
## $ SP.POP.TOTL <dbl> 89101, 90691, 91781, 92701, 93540, 94483, 95606, 96787, 97…
## $ SP.POP.GROW <dbl> 2.5392344, 1.7687566, 1.1947181, 0.9973955, 0.9009892, 1.0…
wbp3<-wbp2 %>%
  left_join(who, c("country" = "iso3"),relationship = "many-to-many")%>%
  dplyr :: select('country.y', 'year.x', 'SP.POP.GROW','SP.POP.TOTL','SP.URB.GROW','SP.URB.TOTL')%>%
  rename(country= country.y, year = year.x)%>%
  print()
## # A tibble: 126,594 × 6
##    country year  SP.POP.GROW SP.POP.TOTL SP.URB.GROW SP.URB.TOTL
##    <chr>   <chr>       <dbl>       <dbl>       <dbl>       <dbl>
##  1 Aruba   2000         2.54       89101        1.66       41625
##  2 Aruba   2000         2.54       89101        1.66       41625
##  3 Aruba   2000         2.54       89101        1.66       41625
##  4 Aruba   2000         2.54       89101        1.66       41625
##  5 Aruba   2000         2.54       89101        1.66       41625
##  6 Aruba   2000         2.54       89101        1.66       41625
##  7 Aruba   2000         2.54       89101        1.66       41625
##  8 Aruba   2000         2.54       89101        1.66       41625
##  9 Aruba   2000         2.54       89101        1.66       41625
## 10 Aruba   2000         2.54       89101        1.66       41625
## # ℹ 126,584 more rows


  1. With the data set obtained in step b), answer which countries had undergone significant urbanization between 2000 and 2017. You need to show the code and the results (either graphs or tables) to support your answer.
# Enter code here.

wbp4<-wbp3%>%
  group_by(country)%>%
  summarise(avg_urb_growth_from_2000_2017 = mean(SP.URB.GROW))%>%
  arrange(desc(avg_urb_growth_from_2000_2017))%>%
  top_n(40, avg_urb_growth_from_2000_2017) %>%
  print(n=10)
## # A tibble: 40 × 2
##    country                     avg_urb_growth_from_2000_2017
##    <chr>                                               <dbl>
##  1 Qatar                                                8.43
##  2 Equatorial Guinea                                    6.72
##  3 United Arab Emirates                                 6.40
##  4 Burkina Faso                                         5.88
##  5 Burundi                                              5.83
##  6 Uganda                                               5.68
##  7 Mali                                                 5.44
##  8 United Republic of Tanzania                          5.16
##  9 Angola                                               5.14
## 10 Maldives                                             5.01
## # ℹ 30 more rows
wbp4%>%
  filter(avg_urb_growth_from_2000_2017>=5)
ggplot(wbp4)+
  stat_summary(mapping = aes(x = country, y = avg_urb_growth_from_2000_2017), geom = "bar")+
  coord_flip()+
  labs(title = "Average urban growth from 2000-2017 accross different country", x = "Country", y = "Average urban growth from 2000-2017")+
  theme(plot.title = element_text(hjust = 0.5))

Answer:

Based on the graph, most of the country has the average urban population growth between 2000 anf 2017 below 5%

Only Qatar, Equatorial Guinea, United Arab Emirates, Burkina Faso, Burundi, Uganda, Mali, United Republic of Tanzania Angola, Maldives have the average urban population growth between 2000 and 2017 above 5%

Qatar, Equatorial Guinea and United Arab Emirates are the top countries that had undergone significant urbanization between 2000 and 2017 because their average growth are much higher compare to the others country

2. Factors and Relational Data

For the following tasks, use data set planes and flights from the nycflights13 package.

  1. For the planes data set, only keep planes from manufacturers that have more than 10 samples in the data set. Then convert manufacturer column into a factor. Then combine AIRBUS and AIRBUS INDUSTRIE as a single category AIRBUS; combine MCDONNELL DOUGLAS, MCDONNELL DOUGLAS AIRCRAFT CO and MCDONNELL DOUGLAS CORPORATION into a single category MCDONNELL. Save your data frame as a new one. Show your code and the first 10 rows of the updated data frame.
# Enter code here.

pm<- planes%>%
  group_by(manufacturer)%>%
  summarise(n= n())%>%
  filter(n>10)

p1<- planes%>%
  inner_join(pm,by="manufacturer")

p2<-p1$manufacturer<-factor(p1$manufacturer)

print(p1)
## # A tibble: 3,270 × 10
##    tailnum  year type        manufacturer model engines seats speed engine     n
##    <chr>   <int> <chr>       <fct>        <chr>   <int> <int> <int> <chr>  <int>
##  1 N10156   2004 Fixed wing… EMBRAER      EMB-…       2    55    NA Turbo…   299
##  2 N102UW   1998 Fixed wing… AIRBUS INDU… A320…       2   182    NA Turbo…   400
##  3 N103US   1999 Fixed wing… AIRBUS INDU… A320…       2   182    NA Turbo…   400
##  4 N104UW   1999 Fixed wing… AIRBUS INDU… A320…       2   182    NA Turbo…   400
##  5 N10575   2002 Fixed wing… EMBRAER      EMB-…       2    55    NA Turbo…   299
##  6 N105UW   1999 Fixed wing… AIRBUS INDU… A320…       2   182    NA Turbo…   400
##  7 N107US   1999 Fixed wing… AIRBUS INDU… A320…       2   182    NA Turbo…   400
##  8 N108UW   1999 Fixed wing… AIRBUS INDU… A320…       2   182    NA Turbo…   400
##  9 N109UW   1999 Fixed wing… AIRBUS INDU… A320…       2   182    NA Turbo…   400
## 10 N110UW   1999 Fixed wing… AIRBUS INDU… A320…       2   182    NA Turbo…   400
## # ℹ 3,260 more rows
p3<-p1 %>%
  mutate(manufacturer = fct_collapse(manufacturer,
    "AIRBUS" = c("AIRBUS", "AIRBUS INDUSTRIE"),
    "MCDONNELL" = c("MCDONNELL DOUGLAS", "MCDONNELL DOUGLAS AIRCRAFT CO","MCDONNELL DOUGLAS CORPORATION")
  ))%>%
  print(n=10)
## # A tibble: 3,270 × 10
##    tailnum  year type        manufacturer model engines seats speed engine     n
##    <chr>   <int> <chr>       <fct>        <chr>   <int> <int> <int> <chr>  <int>
##  1 N10156   2004 Fixed wing… EMBRAER      EMB-…       2    55    NA Turbo…   299
##  2 N102UW   1998 Fixed wing… AIRBUS       A320…       2   182    NA Turbo…   400
##  3 N103US   1999 Fixed wing… AIRBUS       A320…       2   182    NA Turbo…   400
##  4 N104UW   1999 Fixed wing… AIRBUS       A320…       2   182    NA Turbo…   400
##  5 N10575   2002 Fixed wing… EMBRAER      EMB-…       2    55    NA Turbo…   299
##  6 N105UW   1999 Fixed wing… AIRBUS       A320…       2   182    NA Turbo…   400
##  7 N107US   1999 Fixed wing… AIRBUS       A320…       2   182    NA Turbo…   400
##  8 N108UW   1999 Fixed wing… AIRBUS       A320…       2   182    NA Turbo…   400
##  9 N109UW   1999 Fixed wing… AIRBUS       A320…       2   182    NA Turbo…   400
## 10 N110UW   1999 Fixed wing… AIRBUS       A320…       2   182    NA Turbo…   400
## # ℹ 3,260 more rows


  1. Join the flights data set with the planes data set, study how plane models correlate with the flight distance with proper data visualizations or summary tables. You are required to summarize your findings concisely in your own words.
# Enter code here.

glimpse(planes)
## Rows: 3,322
## Columns: 9
## $ tailnum      <chr> "N10156", "N102UW", "N103US", "N104UW", "N10575", "N105UW…
## $ year         <int> 2004, 1998, 1999, 1999, 2002, 1999, 1999, 1999, 1999, 199…
## $ type         <chr> "Fixed wing multi engine", "Fixed wing multi engine", "Fi…
## $ manufacturer <chr> "EMBRAER", "AIRBUS INDUSTRIE", "AIRBUS INDUSTRIE", "AIRBU…
## $ model        <chr> "EMB-145XR", "A320-214", "A320-214", "A320-214", "EMB-145…
## $ engines      <int> 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, …
## $ seats        <int> 55, 182, 182, 182, 55, 182, 182, 182, 182, 182, 55, 55, 5…
## $ speed        <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ engine       <chr> "Turbo-fan", "Turbo-fan", "Turbo-fan", "Turbo-fan", "Turb…
glimpse(flights)
## Rows: 336,776
## Columns: 19
## $ year           <int> 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2…
## $ month          <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ day            <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ dep_time       <int> 517, 533, 542, 544, 554, 554, 555, 557, 557, 558, 558, …
## $ sched_dep_time <int> 515, 529, 540, 545, 600, 558, 600, 600, 600, 600, 600, …
## $ dep_delay      <dbl> 2, 4, 2, -1, -6, -4, -5, -3, -3, -2, -2, -2, -2, -2, -1…
## $ arr_time       <int> 830, 850, 923, 1004, 812, 740, 913, 709, 838, 753, 849,…
## $ sched_arr_time <int> 819, 830, 850, 1022, 837, 728, 854, 723, 846, 745, 851,…
## $ arr_delay      <dbl> 11, 20, 33, -18, -25, 12, 19, -14, -8, 8, -2, -3, 7, -1…
## $ carrier        <chr> "UA", "UA", "AA", "B6", "DL", "UA", "B6", "EV", "B6", "…
## $ flight         <int> 1545, 1714, 1141, 725, 461, 1696, 507, 5708, 79, 301, 4…
## $ tailnum        <chr> "N14228", "N24211", "N619AA", "N804JB", "N668DN", "N394…
## $ origin         <chr> "EWR", "LGA", "JFK", "JFK", "LGA", "EWR", "EWR", "LGA",…
## $ dest           <chr> "IAH", "IAH", "MIA", "BQN", "ATL", "ORD", "FLL", "IAD",…
## $ air_time       <dbl> 227, 227, 160, 183, 116, 150, 158, 53, 140, 138, 149, 1…
## $ distance       <dbl> 1400, 1416, 1089, 1576, 762, 719, 1065, 229, 944, 733, …
## $ hour           <dbl> 5, 5, 5, 5, 6, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 6, 6, 6…
## $ minute         <dbl> 15, 29, 40, 45, 0, 58, 0, 0, 0, 0, 0, 0, 0, 0, 0, 59, 0…
## $ time_hour      <dttm> 2013-01-01 05:00:00, 2013-01-01 05:00:00, 2013-01-01 0…
ft<-flights%>%
  left_join(planes, by="tailnum")%>%
  group_by(model)%>%
  summarise(avg_dis = mean(distance))%>%
  arrange(desc(avg_dis))%>%
  top_n(40, avg_dis) %>%
  print()
## # A tibble: 40 × 2
##    model     avg_dis
##    <chr>       <dbl>
##  1 A330-243    4983 
##  2 767-424ER   3850.
##  3 A319-115    2525.
##  4 777-222     2520 
##  5 757-212     2475 
##  6 A330-323    2422 
##  7 737-890     2402 
##  8 737-8FH     2402 
##  9 737-990     2402 
## 10 737-990ER   2402 
## # ℹ 30 more rows
ggplot(ft)+
  stat_summary(mapping = aes(x = model, y = avg_dis), geom = "bar")+
  coord_flip()+
  labs(title = "Average distance accross different model", x = "Model", y = "Average Distance (miles)")+
  theme(plot.title = element_text(hjust = 0.5))

Answer:

For the top 40 model with the longest travel distance, ost of the model can fly with the average distance from 1500-2500 miles.

There are afew of them can fly further which is :
777-222: 2520 miles
A319-115: 2525 miles
767-424ER: 3850 miles
A330-243: 4983 miles

3. Datetime and Data Transformation

For the following tasks, use the data set weather, flights or planes from the nycflights13 package.

  1. Create a plot of the temperature change across the whole year of 2013 at the JFK airport. (Hint: You need to first create a datetime variable for each hour.)
# Enter code here.
ftemp<-flights %>% 
  dplyr::select(year, month, day, hour, minute,tailnum, origin) %>% 
  mutate(departure_date = make_date(year, month, day)) %>%
  mutate(departure_scheduled = make_datetime(year, month, day, hour, minute, tz = Sys.timezone()))%>%
  filter(year ==2013, origin == "JFK")%>%
  left_join(weather)%>%
  filter(!is.na(temp))%>%
  group_by(month)%>%
  summarise(avg_temp = mean(temp))%>%
  print()
## # A tibble: 12 × 2
##    month avg_temp
##    <int>    <dbl>
##  1     1     36.3
##  2     2     35.2
##  3     3     40.8
##  4     4     51.9
##  5     5     61.1
##  6     6     71.6
##  7     7     80.2
##  8     8     75.3
##  9     9     68.8
## 10    10     61.2
## 11    11     46.0
## 12    12     39.5
ggplot(ftemp)+
  stat_summary(mapping = aes(x = as.factor(month), y = avg_temp), geom = "bar")+
  labs(title = "Temperature change across the whole year of 2013 at the `JFK` airport", x = "Month", y = "Temperature in F")+
  theme(plot.title = element_text(hjust = 0.5))


  1. Find out which day of the year has the largest temperature difference (defined as the difference between the highest and the lowest temperature) across the day (0am - 11pm).
# Enter code here.

ftemp1<-flights %>% 
  dplyr::select(year, month, day, hour, minute,tailnum, origin) %>% 
  mutate(departure_date = make_date(year, month, day)) %>%
  mutate(departure_scheduled = make_datetime(year, month, day, hour, minute, tz = Sys.timezone()))%>%
  filter(year ==2013, origin == "JFK")%>%
  left_join(weather)%>%
  filter(!is.na(temp))%>%
  group_by(month,day)%>%
  summarise(maxtemp= max(temp), mintemp=min(temp), d =maxtemp-mintemp)%>%
  arrange(desc(d))%>%
  print()
## # A tibble: 364 × 5
## # Groups:   month [12]
##    month   day maxtemp mintemp     d
##    <int> <int>   <dbl>   <dbl> <dbl>
##  1     5     8    62.6    13.1  49.5
##  2     4     9    82.9    48.0  34.9
##  3     9    24    73.9    48.0  25.9
##  4     1    31    55.4    30.0  25.4
##  5    11    27    60.8    36.0  24.8
##  6     1    20    55.0    30.9  24.1
##  7     5    24    69.8    46.0  23.8
##  8     4     1    62.1    39.0  23.0
##  9     4     5    63.0    39.9  23.0
## 10    10     2    84.0    61.0  23.0
## # ℹ 354 more rows

Answer: May 8th has the largest different

  1. Find a way to select all overnight flights (also called “Red Eye Flights” that depart at late night and arrive in the early morning) from the flights data set. Here overnight flights are defined as flights that departed between 10pm and 1am, and having an air time of over 4 hours . Create a categorical variable overnight_flag with YES or NO as the possible values. Show your code and the updated data frame.
# Enter code here.
ovnight<- flights %>% 
  dplyr::select(year, month, day, hour, minute,tailnum, origin,air_time) %>% 
  mutate(departure_date = make_date(year, month, day)) %>%
  mutate(departure_scheduled = make_datetime(year, month, day, hour, minute, tz = Sys.timezone()))%>%
  filter(!is.na(air_time))%>%
  mutate(overnight_flag = ifelse(
    hour %in% c(22, 23, 0, 1)& air_time> 4, "YES","NO"))

glimpse(ovnight)
## Rows: 327,346
## Columns: 11
## $ year                <int> 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 20…
## $ month               <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ day                 <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ hour                <dbl> 5, 5, 5, 5, 6, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 6,…
## $ minute              <dbl> 15, 29, 40, 45, 0, 58, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tailnum             <chr> "N14228", "N24211", "N619AA", "N804JB", "N668DN", …
## $ origin              <chr> "EWR", "LGA", "JFK", "JFK", "LGA", "EWR", "EWR", "…
## $ air_time            <dbl> 227, 227, 160, 183, 116, 150, 158, 53, 140, 138, 1…
## $ departure_date      <date> 2013-01-01, 2013-01-01, 2013-01-01, 2013-01-01, 2…
## $ departure_scheduled <dttm> 2013-01-01 05:15:00, 2013-01-01 05:29:00, 2013-01…
## $ overnight_flag      <chr> "NO", "NO", "NO", "NO", "NO", "NO", "NO", "NO", "N…

Answer:

  1. Someone says that most overnight flights use relatively small planes. Verify whether this is true with the data frame obtained in c) and the planes data set.
# Enter code here.

ovnight1<-ovnight%>%
  left_join(planes,by="tailnum")%>%
  filter(!is.na(seats))%>%
  group_by(overnight_flag)%>%
  summarise(avg_seats=mean(seats))%>%
  print()
## # A tibble: 2 × 2
##   overnight_flag avg_seats
##   <chr>              <dbl>
## 1 NO                  138.
## 2 YES                 106.

Answer: This is true because on average overnight flights use relatively small plane

4. General Analysis and Statistical Tests

Answer the following questions with data visualization or summary. You are required to summarize your findings concisely in your own words and support your conclusion with proper graphs or tables.

  1. From the gss_cat data set, find factors that are significantly correlated with the reported income.
# Enter code here.
glimpse(gss_cat)
## Rows: 21,483
## Columns: 9
## $ year    <int> 2000, 2000, 2000, 2000, 2000, 2000, 2000, 2000, 2000, 2000, 20…
## $ marital <fct> Never married, Divorced, Widowed, Never married, Divorced, Mar…
## $ age     <int> 26, 48, 67, 39, 25, 25, 36, 44, 44, 47, 53, 52, 52, 51, 52, 40…
## $ race    <fct> White, White, White, White, White, White, White, White, White,…
## $ rincome <fct> $8000 to 9999, $8000 to 9999, Not applicable, Not applicable, …
## $ partyid <fct> "Ind,near rep", "Not str republican", "Independent", "Ind,near…
## $ relig   <fct> Protestant, Protestant, Protestant, Orthodox-christian, None, …
## $ denom   <fct> "Southern baptist", "Baptist-dk which", "No denomination", "No…
## $ tvhours <int> 12, NA, 2, 4, 1, NA, 3, NA, 0, 3, 2, NA, 1, NA, 1, 7, NA, 3, 3…
gss_cat1<-gss_cat%>%
filter(!is.na(rincome))

#age
oneway.test(gss_cat1$age ~ gss_cat1$rincome)
## 
##  One-way analysis of means (not assuming equal variances)
## 
## data:  gss_cat1$age and gss_cat1$rincome
## F = 190.67, num df = 15.0, denom df = 1934.6, p-value < 2.2e-16
#year
oneway.test(gss_cat1$year ~ gss_cat1$rincome)
## 
##  One-way analysis of means (not assuming equal variances)
## 
## data:  gss_cat1$year and gss_cat1$rincome
## F = 14.516, num df = 15.0, denom df = 1972.8, p-value < 2.2e-16
#marital
chisq.test(table(gss_cat1$marital, gss_cat1$rincome))
## Warning in chisq.test(table(gss_cat1$marital, gss_cat1$rincome)): Chi-squared
## approximation may be incorrect
## 
##  Pearson's Chi-squared test
## 
## data:  table(gss_cat1$marital, gss_cat1$rincome)
## X-squared = 2470.6, df = 75, p-value < 2.2e-16
#race
gss_cat4<- gss_cat1%>%
  filter(!is.na(race))
chisq.test(table(gss_cat4$race,gss_cat4$rincome))
## Warning in chisq.test(table(gss_cat4$race, gss_cat4$rincome)): Chi-squared
## approximation may be incorrect
## 
##  Pearson's Chi-squared test
## 
## data:  table(gss_cat4$race, gss_cat4$rincome)
## X-squared = NaN, df = 45, p-value = NA
#partyid
chisq.test(table(gss_cat1$partyid, gss_cat1$rincome))
## Warning in chisq.test(table(gss_cat1$partyid, gss_cat1$rincome)): Chi-squared
## approximation may be incorrect
## 
##  Pearson's Chi-squared test
## 
## data:  table(gss_cat1$partyid, gss_cat1$rincome)
## X-squared = 806.11, df = 135, p-value < 2.2e-16
#relig
gss_cat3<- gss_cat1%>%
  filter(!is.na(relig))
chisq.test(table(gss_cat3$relig, gss_cat3$rincome))
## Warning in chisq.test(table(gss_cat3$relig, gss_cat3$rincome)): Chi-squared
## approximation may be incorrect
## 
##  Pearson's Chi-squared test
## 
## data:  table(gss_cat3$relig, gss_cat3$rincome)
## X-squared = NaN, df = 225, p-value = NA
#denom
chisq.test(table(gss_cat1$denom, gss_cat1$rincome))
## Warning in chisq.test(table(gss_cat1$denom, gss_cat1$rincome)): Chi-squared
## approximation may be incorrect
## 
##  Pearson's Chi-squared test
## 
## data:  table(gss_cat1$denom, gss_cat1$rincome)
## X-squared = 951.2, df = 435, p-value < 2.2e-16
#tvhours
gss_cat2<- gss_cat1%>%
  filter(!is.na(tvhours))
oneway.test(gss_cat2$tvhours ~ gss_cat2$rincome)
## 
##  One-way analysis of means (not assuming equal variances)
## 
## data:  gss_cat2$tvhours and gss_cat2$rincome
## F = 53.509, num df = 15, denom df = 1004, p-value < 2.2e-16

Answer: (please dont count this answer) Since the p-value is very small, year of survey, age, marital status, party affiliation, denomination, and tvhours are significantly correlated with the reported income.


  1. From the smoking data set of the openintro package, find factors that are significantly correlated with the smoking status and the number of cigarettes smoked per day.
# Enter code here.
glimpse(smoking)
## Rows: 1,691
## Columns: 12
## $ gender                <fct> Male, Female, Male, Female, Female, Female, Male…
## $ age                   <int> 38, 42, 40, 40, 39, 37, 53, 44, 40, 41, 72, 49, …
## $ marital_status        <fct> Divorced, Single, Married, Married, Married, Mar…
## $ highest_qualification <fct> No Qualification, No Qualification, Degree, Degr…
## $ nationality           <fct> British, British, English, English, British, Bri…
## $ ethnicity             <fct> White, White, White, White, White, White, White,…
## $ gross_income          <fct> "2,600 to 5,200", "Under 2,600", "28,600 to 36,4…
## $ region                <fct> The North, The North, The North, The North, The …
## $ smoke                 <fct> No, Yes, No, No, No, No, Yes, No, Yes, Yes, No, …
## $ amt_weekends          <int> NA, 12, NA, NA, NA, NA, 6, NA, 8, 15, NA, NA, NA…
## $ amt_weekdays          <int> NA, 12, NA, NA, NA, NA, 6, NA, 8, 12, NA, NA, NA…
## $ type                  <fct> , Packets, , , , , Packets, , Hand-Rolled, Packe…
sm<-smoking%>%
  filter(!is.na(smoke)|!is.na(ethnicity)|!is.na(amt_weekends)&!is.na(amt_weekdays))%>%
  mutate(amt_day = (amt_weekends+amt_weekdays)/2)%>%
  print()
## # A tibble: 1,691 × 13
##    gender   age marital_status highest_qualification nationality ethnicity
##    <fct>  <int> <fct>          <fct>                 <fct>       <fct>    
##  1 Male      38 Divorced       No Qualification      British     White    
##  2 Female    42 Single         No Qualification      British     White    
##  3 Male      40 Married        Degree                English     White    
##  4 Female    40 Married        Degree                English     White    
##  5 Female    39 Married        GCSE/O Level          British     White    
##  6 Female    37 Married        GCSE/O Level          British     White    
##  7 Male      53 Married        Degree                British     White    
##  8 Male      44 Single         Degree                English     White    
##  9 Male      40 Single         GCSE/CSE              English     White    
## 10 Female    41 Married        No Qualification      English     White    
## # ℹ 1,681 more rows
## # ℹ 7 more variables: gross_income <fct>, region <fct>, smoke <fct>,
## #   amt_weekends <int>, amt_weekdays <int>, type <fct>, amt_day <dbl>
#highest_qualification
chisq.test(table(sm$highest_qualification, sm$smoke))
## 
##  Pearson's Chi-squared test
## 
## data:  table(sm$highest_qualification, sm$smoke)
## X-squared = 40.281, df = 7, p-value = 1.112e-06
oneway.test(sm$amt_day~ sm$highest_qualification)
## 
##  One-way analysis of means (not assuming equal variances)
## 
## data:  sm$amt_day and sm$highest_qualification
## F = 2.2631, num df = 7.00, denom df = 104.83, p-value = 0.03476
#region
chisq.test(table(sm$region, sm$smoke))
## 
##  Pearson's Chi-squared test
## 
## data:  table(sm$region, sm$smoke)
## X-squared = 12.671, df = 6, p-value = 0.04857
oneway.test(sm$amt_day~ sm$region)
## 
##  One-way analysis of means (not assuming equal variances)
## 
## data:  sm$amt_day and sm$region
## F = 2.2964, num df = 6.00, denom df = 129.55, p-value = 0.03861
#gender
chisq.test(table(sm$gender, sm$smoke))
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  table(sm$gender, sm$smoke)
## X-squared = 0.42699, df = 1, p-value = 0.5135
oneway.test(sm$amt_day~ sm$gender)
## 
##  One-way analysis of means (not assuming equal variances)
## 
## data:  sm$amt_day and sm$gender
## F = 14.816, num df = 1.00, denom df = 325.74, p-value = 0.0001427
#age
oneway.test(sm$age~ sm$smoke)
## 
##  One-way analysis of means (not assuming equal variances)
## 
## data:  sm$age and sm$smoke
## F = 99.444, num df = 1.0, denom df = 831.2, p-value < 2.2e-16
cor(sm$age, sm$amt_day)
## [1] NA
#marital_status
chisq.test(table(sm$marital_status, sm$smoke))
## 
##  Pearson's Chi-squared test
## 
## data:  table(sm$marital_status, sm$smoke)
## X-squared = 74.98, df = 4, p-value = 2.012e-15
oneway.test(sm$amt_day~ sm$marital_status)
## 
##  One-way analysis of means (not assuming equal variances)
## 
## data:  sm$amt_day and sm$marital_status
## F = 0.74965, num df = 4.000, denom df = 98.133, p-value = 0.5606
#nationality
chisq.test(table(sm$nationality, sm$smoke))
## Warning in chisq.test(table(sm$nationality, sm$smoke)): Chi-squared
## approximation may be incorrect
## 
##  Pearson's Chi-squared test
## 
## data:  table(sm$nationality, sm$smoke)
## X-squared = 14.929, df = 7, p-value = 0.03692
#oneway.test(sm$amt_day~ sm$nationality) not enough observation

#ethnicity
chisq.test(table(sm$ethnicity, sm$smoke))
## Warning in chisq.test(table(sm$ethnicity, sm$smoke)): Chi-squared approximation
## may be incorrect
## 
##  Pearson's Chi-squared test
## 
## data:  table(sm$ethnicity, sm$smoke)
## X-squared = 3.0561, df = 6, p-value = 0.8018
#oneway.test(sm$amt_day~ sm$ethnicity)not enough observation

#gross_income
chisq.test(table(sm$gross_income, sm$smoke))
## Warning in chisq.test(table(sm$gross_income, sm$smoke)): Chi-squared
## approximation may be incorrect
## 
##  Pearson's Chi-squared test
## 
## data:  table(sm$gross_income, sm$smoke)
## X-squared = 19.835, df = 9, p-value = 0.01896
oneway.test(sm$amt_day~ sm$gross_income)
## 
##  One-way analysis of means (not assuming equal variances)
## 
## data:  sm$amt_day and sm$gross_income
## F = 1.1923, num df = 9.000, denom df = 40.541, p-value = 0.3261
#type
chisq.test(table(sm$type, sm$smoke))
## Warning in chisq.test(table(sm$type, sm$smoke)): Chi-squared approximation may
## be incorrect
## 
##  Pearson's Chi-squared test
## 
## data:  table(sm$type, sm$smoke)
## X-squared = 1691, df = 4, p-value < 2.2e-16
oneway.test(sm$amt_day~ sm$type)
## 
##  One-way analysis of means (not assuming equal variances)
## 
## data:  sm$amt_day and sm$type
## F = 2.018, num df = 3.000, denom df = 36.822, p-value = 0.1283

Answer: (Please dont count this answer) Highest education level and Region significantly correlated with the smoking status and the number of cigarettes smoked per day