Assignment 1

library(nycflights13)
# Load the flights table that ships with the nycflights13 package.
data("flights", package = "nycflights13")
# Open the help page of the dataset (column descriptions).
help("flights")
# Show the structure of the table: dimensions, column names and types.
str(flights)
## Classes 'tbl_df', 'tbl' and 'data.frame':    336776 obs. of  19 variables:
##  $ year          : int  2013 2013 2013 2013 2013 2013 2013 2013 2013 2013 ...
##  $ month         : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ day           : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ dep_time      : int  517 533 542 544 554 554 555 557 557 558 ...
##  $ sched_dep_time: int  515 529 540 545 600 558 600 600 600 600 ...
##  $ dep_delay     : num  2 4 2 -1 -6 -4 -5 -3 -3 -2 ...
##  $ arr_time      : int  830 850 923 1004 812 740 913 709 838 753 ...
##  $ sched_arr_time: int  819 830 850 1022 837 728 854 723 846 745 ...
##  $ arr_delay     : num  11 20 33 -18 -25 12 19 -14 -8 8 ...
##  $ carrier       : chr  "UA" "UA" "AA" "B6" ...
##  $ flight        : int  1545 1714 1141 725 461 1696 507 5708 79 301 ...
##  $ tailnum       : chr  "N14228" "N24211" "N619AA" "N804JB" ...
##  $ origin        : chr  "EWR" "LGA" "JFK" "JFK" ...
##  $ dest          : chr  "IAH" "IAH" "MIA" "BQN" ...
##  $ air_time      : num  227 227 160 183 116 150 158 53 140 138 ...
##  $ distance      : num  1400 1416 1089 1576 762 ...
##  $ hour          : num  5 5 5 5 6 5 6 6 6 6 ...
##  $ minute        : num  15 29 40 45 0 58 0 0 0 0 ...
##  $ time_hour     : POSIXct, format: "2013-01-01 05:00:00" "2013-01-01 05:00:00" ...

Cau 1

Cau 2

library(VIM)
## Loading required package: colorspace
## Loading required package: grid
## Loading required package: data.table
## VIM is ready to use. 
##  Since version 4.0.0 the GUI is in its own package VIMGUI.
## 
##           Please use the package to use the new (and old) GUI.
## Suggestions and bug-reports can be submitted at: https://github.com/alexkowa/VIM/issues
## 
## Attaching package: 'VIM'
## The following object is masked from 'package:datasets':
## 
##     sleep
# Summarise and plot the missing-value pattern of every column of flights.
# The returned summary is kept in flights_mis; variables are sorted by their
# share of missing values (sortVars = TRUE).
flights_mis <- aggr(
  flights,
  col = c("navyblue", "yellow"),
  numbers = TRUE,
  sortVars = TRUE,
  labels = names(flights),
  cex.axis = 0.7,
  gap = 3
)

## 
##  Variables sorted by number of missings: 
##        Variable       Count
##       arr_delay 0.028000808
##        air_time 0.028000808
##        arr_time 0.025871796
##        dep_time 0.024511842
##       dep_delay 0.024511842
##         tailnum 0.007458964
##            year 0.000000000
##           month 0.000000000
##             day 0.000000000
##  sched_dep_time 0.000000000
##  sched_arr_time 0.000000000
##         carrier 0.000000000
##          flight 0.000000000
##          origin 0.000000000
##            dest 0.000000000
##        distance 0.000000000
##            hour 0.000000000
##          minute 0.000000000
##       time_hour 0.000000000
# Result: about 97.2% of the data has no missing values; 13 variables are complete and 6 variables contain missing values

Cau 3

# Flights scheduled on 1 January 2013.
# which() keeps only the rows where the condition is TRUE (NA rows are
# dropped, exactly as subset() does).
is_new_years_day <- with(flights, year == 2013 & month == 1 & day == 1)
df1 <- flights[which(is_new_years_day), ]
dim(df1)
## [1] 842  19
# So there were 842 flights on 1/1/2013

Cau 4

# Number of flights that departed late (positive departure delay).
# subset() drops rows whose dep_delay is NA, so flights without a recorded
# departure delay are not counted as delayed.

df2 <- subset(flights, dep_delay > 0)
dim(df2)
## [1] 128432     19
# So 128432 flights departed late
# Share of all flights in the table; computed from the data instead of
# retyping the two row counts by hand.
nrow(df2) / nrow(flights)
## [1] 0.3813573
# i.e. about 38.1% of all flights (0.38 as a proportion, not 0.38%)

Cau 5.

# Danh sach 10 chuyen bay tre gio nhat
library(tidyverse)
## Loading tidyverse: ggplot2
## Loading tidyverse: tibble
## Loading tidyverse: tidyr
## Loading tidyverse: readr
## Loading tidyverse: purrr
## Loading tidyverse: dplyr
## Conflicts with tidy packages ----------------------------------------------
## between():   dplyr, data.table
## filter():    dplyr, stats
## first():     dplyr, data.table
## lag():       dplyr, stats
## last():      dplyr, data.table
## transpose(): purrr, data.table
# Order the delayed flights from the longest to the shortest departure delay.
df3 <- df2 %>%
  arrange(desc(dep_delay))
# Preview the first rows (head() shows 6 rows by default).
head(df3, n = 6)
## # A tibble: 6 <U+00D7> 19
##    year month   day dep_time sched_dep_time dep_delay arr_time
##   <int> <int> <int>    <int>          <int>     <dbl>    <int>
## 1  2013     1     9      641            900      1301     1242
## 2  2013     6    15     1432           1935      1137     1607
## 3  2013     1    10     1121           1635      1126     1239
## 4  2013     9    20     1139           1845      1014     1457
## 5  2013     7    22      845           1600      1005     1044
## 6  2013     4    10     1100           1900       960     1342
## # ... with 12 more variables: sched_arr_time <int>, arr_delay <dbl>,
## #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>,
## #   time_hour <dttm>
# Alternatively: sort by departure delay (descending) and keep the first 10 rows.
# In the printed result the 10th flight has dep_delay = 896, so the ten most
# delayed flights are those with a departure delay of at least 896.
head(arrange(df3, desc(dep_delay)), 10)
## # A tibble: 10 <U+00D7> 19
##     year month   day dep_time sched_dep_time dep_delay arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>
## 1   2013     1     9      641            900      1301     1242
## 2   2013     6    15     1432           1935      1137     1607
## 3   2013     1    10     1121           1635      1126     1239
## 4   2013     9    20     1139           1845      1014     1457
## 5   2013     7    22      845           1600      1005     1044
## 6   2013     4    10     1100           1900       960     1342
## 7   2013     3    17     2321            810       911      135
## 8   2013     6    27      959           1900       899     1236
## 9   2013     7    22     2257            759       898      121
## 10  2013    12     5      756           1700       896     1058
## # ... with 12 more variables: sched_arr_time <int>, arr_delay <dbl>,
## #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>,
## #   time_hour <dttm>

Cau 6

# Split the flights by departure airport (one data frame per origin).
df5_1 <- filter(flights, origin == "EWR") # 120835 flights departed from EWR
df5_2 <- filter(flights, origin == "LGA") # 104662 flights departed from LGA
df5_3 <- filter(flights, origin == "JFK") # 111279 flights departed from JFK
# Rank the origins by number of flights so the answer is computed from the
# data rather than read off the hand-written counts above.
count(flights, origin, sort = TRUE)
# Sanity check: the three subsets together must cover every row of flights.
nrow(df5_1) + nrow(df5_2) + nrow(df5_3)
## [1] 336776
# So EWR is the airport with the most departing flights

Cau 10

Thang nao trong nam co nhieu chuyen bay nhat

# One data frame per month (kept because later answers refer to these objects).
df6_1 <- filter(flights, month == 1)
df6_2 <- filter(flights, month == 2)
df6_3 <- filter(flights, month == 3)
df6_4 <- filter(flights, month == 4)
df6_5 <- filter(flights, month == 5)
df6_6 <- filter(flights, month == 6)
df6_7 <- filter(flights, month == 7)
df6_8 <- filter(flights, month == 8)
df6_9 <- filter(flights, month == 9)
df6_10 <- filter(flights, month == 10)
df6_11 <- filter(flights, month == 11)
df6_12 <- filter(flights, month == 12)
# Count the flights per month directly instead of copying row counts by hand.
# NOTE(review): the hand-typed per-month figures previously recorded here
# summed to 336768, not the 336776 rows of flights, so at least one of them
# was mistyped; use the table below as the source of truth.
flights_per_month <- count(flights, month)
flights_per_month
# Month(s) with the largest number of flights.
filter(flights_per_month, n == max(n))
# The earlier run reported July as the busiest month of the year (29425 flights)
# Bar chart of the number of flights in each month.
library(ggplot2)
ggplot(flights_per_month, aes(x = factor(month), y = n)) +
  geom_bar(stat = "identity") +
  labs(x = "Month", y = "Number of flights")

Cau 11. So chuyen bay co thoi gian khoi hanh tre hon 120 phut

# Flights whose departure delay is at least 120 minutes.
# NOTE(review): the heading says "more than 120 minutes" but the condition is
# >= 120, so a delay of exactly 120 minutes is included - confirm the intent.
df7 <- df3 %>%
  filter(dep_delay >= 120)
# The author reports 9888 such flights

Cau 12. Tim so chuyen bay den san IAH va HOU

# Flights whose destination is IAH.
df8_1 <- flights %>%
  filter(dest == "IAH")
# There are 7198 flights to IAH (row count of the tibble printed below)
df8_1
## # A tibble: 7,198 <U+00D7> 19
##     year month   day dep_time sched_dep_time dep_delay arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>
## 1   2013     1     1      517            515         2      830
## 2   2013     1     1      533            529         4      850
## 3   2013     1     1      623            627        -4      933
## 4   2013     1     1      728            732        -4     1041
## 5   2013     1     1      739            739         0     1104
## 6   2013     1     1      908            908         0     1228
## 7   2013     1     1     1028           1026         2     1350
## 8   2013     1     1     1044           1045        -1     1352
## 9   2013     1     1     1114            900       134     1447
## 10  2013     1     1     1205           1200         5     1503
## # ... with 7,188 more rows, and 12 more variables: sched_arr_time <int>,
## #   arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## #   origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## #   minute <dbl>, time_hour <dttm>
# Flights whose destination is HOU.
df8_2 <- flights %>%
  filter(dest == "HOU")
# There are 2115 flights to HOU
df8_2
## # A tibble: 2,115 <U+00D7> 19
##     year month   day dep_time sched_dep_time dep_delay arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>
## 1   2013     1     1     1208           1158        10     1540
## 2   2013     1     1     1306           1300         6     1622
## 3   2013     1     1     1708           1700         8     2037
## 4   2013     1     1     2030           2035        -5     2354
## 5   2013     1     2      734            700        34     1045
## 6   2013     1     2     1156           1158        -2     1517
## 7   2013     1     2     1319           1305        14     1633
## 8   2013     1     2     1810           1655        75     2146
## 9   2013     1     2     2031           2035        -4     2353
## 10  2013     1     3      704            700         4     1036
## # ... with 2,105 more rows, and 12 more variables: sched_arr_time <int>,
## #   arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## #   origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## #   minute <dbl>, time_hour <dttm>

Cau 13

Tu cac df tao ra o cau 10, thang 7 co 29425 chuyen bay, thang 8 co 29327 chuyen bay, thang 9 co 27572 chuyen bay

Cau 14

library(stringr)

Cau 15

library(tidyverse)
# Average speed of each flight: distance divided by time in the air.
# NOTE(review): nycflights13 documents distance in miles and air_time in
# minutes, which would make avespeed miles per minute - confirm the intended unit.
df10 <- flights %>%
  mutate(avespeed = distance / air_time)
df10
## # A tibble: 336,776 <U+00D7> 20
##     year month   day dep_time sched_dep_time dep_delay arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>
## 1   2013     1     1      517            515         2      830
## 2   2013     1     1      533            529         4      850
## 3   2013     1     1      542            540         2      923
## 4   2013     1     1      544            545        -1     1004
## 5   2013     1     1      554            600        -6      812
## 6   2013     1     1      554            558        -4      740
## 7   2013     1     1      555            600        -5      913
## 8   2013     1     1      557            600        -3      709
## 9   2013     1     1      557            600        -3      838
## 10  2013     1     1      558            600        -2      753
## # ... with 336,766 more rows, and 13 more variables: sched_arr_time <int>,
## #   arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## #   origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## #   minute <dbl>, time_hour <dttm>, avespeed <dbl>
# Air time expressed in hours (air_time divided by 60).
df11 <- flights %>%
  mutate(airtime_hour = air_time / 60)
df11
## # A tibble: 336,776 <U+00D7> 20
##     year month   day dep_time sched_dep_time dep_delay arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>
## 1   2013     1     1      517            515         2      830
## 2   2013     1     1      533            529         4      850
## 3   2013     1     1      542            540         2      923
## 4   2013     1     1      544            545        -1     1004
## 5   2013     1     1      554            600        -6      812
## 6   2013     1     1      554            558        -4      740
## 7   2013     1     1      555            600        -5      913
## 8   2013     1     1      557            600        -3      709
## 9   2013     1     1      557            600        -3      838
## 10  2013     1     1      558            600        -2      753
## # ... with 336,766 more rows, and 13 more variables: sched_arr_time <int>,
## #   arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## #   origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## #   minute <dbl>, time_hour <dttm>, airtime_hour <dbl>

Cau 16

Chon ra 3 hang co nhieu chuyen bay nhat khoi hanh tu NY

Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.