Chạy bộ dữ liệu và xem thông tin về bộ số liệu
library(magrittr)
library(ggplot2)
library(ggthemes)
library("nycflights13")
# Load the `flights` dataset into the workspace.
data ("flights")
# NOTE(review): attach() puts every column on the search path; the code visible
# here always goes through `flights`, so this looks unnecessary — confirm that
# nothing else relies on bare column names before removing it.
attach(flights)
# Open the help page describing the dataset.
?flights
## starting httpd help server ...
## done
# Open the table in the interactive viewer, then print rows and columns.
View(flights)
c(nrow(flights), ncol(flights))
## [1] 336776 19
Có 19 biến số và 336776 quan sát
Kiểu dữ liệu của 19 biến số đó là:
# Show each column's storage type together with its first few values.
str(flights)
## Classes 'tbl_df', 'tbl' and 'data.frame': 336776 obs. of 19 variables:
## $ year : int 2013 2013 2013 2013 2013 2013 2013 2013 2013 2013 ...
## $ month : int 1 1 1 1 1 1 1 1 1 1 ...
## $ day : int 1 1 1 1 1 1 1 1 1 1 ...
## $ dep_time : int 517 533 542 544 554 554 555 557 557 558 ...
## $ sched_dep_time: int 515 529 540 545 600 558 600 600 600 600 ...
## $ dep_delay : num 2 4 2 -1 -6 -4 -5 -3 -3 -2 ...
## $ arr_time : int 830 850 923 1004 812 740 913 709 838 753 ...
## $ sched_arr_time: int 819 830 850 1022 837 728 854 723 846 745 ...
## $ arr_delay : num 11 20 33 -18 -25 12 19 -14 -8 8 ...
## $ carrier : chr "UA" "UA" "AA" "B6" ...
## $ flight : int 1545 1714 1141 725 461 1696 507 5708 79 301 ...
## $ tailnum : chr "N14228" "N24211" "N619AA" "N804JB" ...
## $ origin : chr "EWR" "LGA" "JFK" "JFK" ...
## $ dest : chr "IAH" "IAH" "MIA" "BQN" ...
## $ air_time : num 227 227 160 183 116 150 158 53 140 138 ...
## $ distance : num 1400 1416 1089 1576 762 ...
## $ hour : num 5 5 5 5 6 5 6 6 6 6 ...
## $ minute : num 15 29 40 45 0 58 0 0 0 0 ...
## $ time_hour : POSIXct, format: "2013-01-01 05:00:00" "2013-01-01 05:00:00" ...
Đếm số dữ liệu thiếu (NA) của từng biến số
# Number of NA entries in one column.
count_missing <- function(column) {
  sum(is.na(column))
}
# One count per column, returned as a named list.
lapply(flights, count_missing)
## $year
## [1] 0
##
## $month
## [1] 0
##
## $day
## [1] 0
##
## $dep_time
## [1] 8255
##
## $sched_dep_time
## [1] 0
##
## $dep_delay
## [1] 8255
##
## $arr_time
## [1] 8713
##
## $sched_arr_time
## [1] 0
##
## $arr_delay
## [1] 9430
##
## $carrier
## [1] 0
##
## $flight
## [1] 0
##
## $tailnum
## [1] 2512
##
## $origin
## [1] 0
##
## $dest
## [1] 0
##
## $air_time
## [1] 9430
##
## $distance
## [1] 0
##
## $hour
## [1] 0
##
## $minute
## [1] 0
##
## $time_hour
## [1] 0
Phân tích hình ảnh cho dữ liệu thiếu
library(VIM)
## Loading required package: colorspace
## Loading required package: grid
## Loading required package: data.table
## VIM is ready to use.
## Since version 4.0.0 the GUI is in its own package VIMGUI.
##
## Please use the package to use the new (and old) GUI.
## Suggestions and bug-reports can be submitted at: https://github.com/alexkowa/VIM/issues
##
## Attaching package: 'VIM'
## The following object is masked from 'package:datasets':
##
## sleep
# Missing-data overview: share of NAs per variable plus the combination
# patterns, with variables sorted by their amount of missing values.
missing_palette <- c("navyblue", "yellow")
panel_titles <- c("Missing Data", "Pattern")
aggr(
  flights,
  col = missing_palette,
  numbers = TRUE,
  sortVars = TRUE,
  labels = names(flights),
  cex.axis = .7,
  gap = 3,
  ylab = panel_titles
)
##
## Variables sorted by number of missings:
## Variable Count
## arr_delay 0.028000808
## air_time 0.028000808
## arr_time 0.025871796
## dep_time 0.024511842
## dep_delay 0.024511842
## tailnum 0.007458964
## year 0.000000000
## month 0.000000000
## day 0.000000000
## sched_dep_time 0.000000000
## sched_arr_time 0.000000000
## carrier 0.000000000
## flight 0.000000000
## origin 0.000000000
## dest 0.000000000
## distance 0.000000000
## hour 0.000000000
## minute 0.000000000
## time_hour 0.000000000
Kết luận: -) Số dữ liệu không thiếu chiếm 97.2% dữ liệu -) Có 6 biến có dữ liệu thiếu, bao gồm các biến arr_delay, air_time, arr_time, dep_time, dep_delay, tailnum -) Biến số thiếu nhiều nhất là arr_delay (cùng với air_time) với 2.8%, biến thiếu ít nhất là tailnum với 0.75%
library(tidyverse)
## Loading tidyverse: tibble
## Loading tidyverse: tidyr
## Loading tidyverse: readr
## Loading tidyverse: purrr
## Loading tidyverse: dplyr
## Conflicts with tidy packages ----------------------------------------------
## between(): dplyr, data.table
## filter(): dplyr, stats
## first(): dplyr, data.table
## lag(): dplyr, stats
## last(): dplyr, data.table
## transpose(): purrr, data.table
library(magrittr)
# Number of flights scheduled on 1 January.
flights %>%
  filter(month == 1, day == 1) %>%
  nrow()
## [1] 842
Có 842 chuyến bay khởi hành vào ngày đầu tiên của năm 2013
# Percentage of all rows whose departure delay is positive. Rows with a
# missing dep_delay stay in the denominator.
delayed_count <- flights %>%
  filter(dep_delay > 0) %>%
  nrow()
delayed_count / nrow(flights) * 100
## [1] 38.13573
Có 38% chuyến bay bị trễ giờ khởi hành
# The ten flights with the largest departure delay, worst first.
flights %>%
  filter(dep_delay > 0) %>%
  arrange(desc(dep_delay)) %>%
  head(10)
## # A tibble: 10 × 19
## year month day dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <int> <int> <dbl> <int>
## 1 2013 1 9 641 900 1301 1242
## 2 2013 6 15 1432 1935 1137 1607
## 3 2013 1 10 1121 1635 1126 1239
## 4 2013 9 20 1139 1845 1014 1457
## 5 2013 7 22 845 1600 1005 1044
## 6 2013 4 10 1100 1900 960 1342
## 7 2013 3 17 2321 810 911 135
## 8 2013 6 27 959 1900 899 1236
## 9 2013 7 22 2257 759 898 121
## 10 2013 12 5 756 1700 896 1058
## # ... with 12 more variables: sched_arr_time <int>, arr_delay <dbl>,
## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>,
## # time_hour <dttm>
# Number of flights leaving from each origin airport.
by_origin <- group_by(flights, origin)
summarise(by_origin, freq = n())
## # A tibble: 3 × 2
## origin freq
## <chr> <int>
## 1 EWR 120835
## 2 JFK 111279
## 3 LGA 104662
Sân bay EWR là nơi xuất phát của nhiều chuyến bay nhất
# How many different destination airports appear in the data.
flights %>%
  distinct(dest) %>%
  nrow()
## [1] 105
Chuyến bay từ NY sẽ đến 105 sân bay khác nhau
# Five destinations receiving the most flights.
dest_counts <- flights %>%
  group_by(dest) %>%
  summarise(freq = n())
dest_counts %>%
  arrange(desc(freq)) %>%
  head(5)
## # A tibble: 5 × 2
## dest freq
## <chr> <int>
## 1 ORD 17283
## 2 ATL 17215
## 3 LAX 16174
## 4 BOS 15508
## 5 MCO 14082
5 sân bay tiếp nhận nhiều chuyến bay nhất từ NY là: ORD, ATL, LAX, BOS, MCO
# Five carriers operating the most flights.
carrier_counts <- flights %>%
  group_by(carrier) %>%
  summarise(freq = n())
carrier_counts %>%
  arrange(desc(freq)) %>%
  head(5)
## # A tibble: 5 × 2
## carrier freq
## <chr> <int>
## 1 UA 58665
## 2 B6 54635
## 3 EV 54173
## 4 DL 48110
## 5 AA 32729
5 hãng hàng không có nhiều chuyến bay nhất khởi hành từ NY là: UA, B6, EV, DL, AA
# Flights per month, busiest month first.
monthly_counts <- flights %>%
  group_by(month) %>%
  summarise(freq = n())
arrange(monthly_counts, desc(freq))
## # A tibble: 12 × 2
## month freq
## <int> <int>
## 1 7 29425
## 2 8 29327
## 3 10 28889
## 4 3 28834
## 5 5 28796
## 6 4 28330
## 7 6 28243
## 8 12 28135
## 9 9 27574
## 10 11 27268
## 11 1 27004
## 12 2 24951
Tháng 7 là tháng có nhiều chuyến bay nhất trong năm. Minh họa bằng bar graph
# Bar chart of the number of flights (in thousands) for each month.
# The title, subtitle and caption are Vietnamese; their diacritics had been
# replaced by "?" runs and are restored here. Keep the file saved as UTF-8 so
# they survive.
# Sorting by frequency before plotting was dropped: bar position is driven by
# the `month` aesthetic, so row order has no effect on the chart.
flights %>%
  group_by(month) %>%
  summarise(freq = n()) %>%
  ggplot(aes(month, freq / 1000)) +
  geom_bar(stat = "identity", width = 0.5) +
  labs(
    x = "Tháng",
    y = NULL,
    title = "Số chuyến bay theo các tháng trong năm",
    subtitle = "Đơn vị: Nghìn chuyến",
    caption = "Nguồn: Bureau of Transportation Statistics"
  ) +
  # One tick per calendar month.
  scale_x_continuous(breaks = 1:12) +
  theme_economist()
Thời gian trễ ở đây là thời gian khởi hành trễ hoặc thời gian đến trễ (từ 120 phút trở lên)
# Flights whose departure or arrival delay reaches the threshold.
long_delay <- 120
flights %>%
  filter(dep_delay >= long_delay | arr_delay >= long_delay)
## # A tibble: 11,606 × 19
## year month day dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <int> <int> <dbl> <int>
## 1 2013 1 1 811 630 101 1047
## 2 2013 1 1 848 1835 853 1001
## 3 2013 1 1 957 733 144 1056
## 4 2013 1 1 1114 900 134 1447
## 5 2013 1 1 1505 1310 115 1638
## 6 2013 1 1 1525 1340 105 1831
## 7 2013 1 1 1540 1338 122 2020
## 8 2013 1 1 1549 1445 64 1912
## 9 2013 1 1 1558 1359 119 1718
## 10 2013 1 1 1732 1630 62 2028
## # ... with 11,596 more rows, and 12 more variables: sched_arr_time <int>,
## # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>
# Flights bound for either IAH or HOU.
flights %>%
  filter(dest %in% c("IAH", "HOU"))
## # A tibble: 9,313 × 19
## year month day dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <int> <int> <dbl> <int>
## 1 2013 1 1 517 515 2 830
## 2 2013 1 1 533 529 4 850
## 3 2013 1 1 623 627 -4 933
## 4 2013 1 1 728 732 -4 1041
## 5 2013 1 1 739 739 0 1104
## 6 2013 1 1 908 908 0 1228
## 7 2013 1 1 1028 1026 2 1350
## 8 2013 1 1 1044 1045 -1 1352
## 9 2013 1 1 1114 900 134 1447
## 10 2013 1 1 1205 1200 5 1503
## # ... with 9,303 more rows, and 12 more variables: sched_arr_time <int>,
## # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>
# Number of flights in months 7 through 9.
flights %>%
  filter(month >= 7, month <= 9) %>%
  nrow()
## [1] 86326
Có 86326 chuyến bay trong các tháng hè (tháng 7, 8, 9)
# Number of flights whose dep_time value lies between 0 and 600 inclusive.
# Rows with a missing dep_time are not counted.
# NOTE(review): if the dataset encodes midnight as 2400 rather than 0, those
# departures are not counted here — confirm against the dataset documentation.
flights %>%
  filter(dep_time >= 0, dep_time <= 600) %>%
  nrow()
## [1] 9344
Có 9344 chuyến bay cất cánh trong khoảng thời gian 0h đến 6h
Tạo cột biến mới là v_tb để tính vận tốc trung bình cho mỗi chuyến bay và tạo thêm cột biến air_time_h để tính thời gian bay theo giờ (h)
# Derive two columns and append them to `flights`:
#   v_tb       - average speed: distance covered per hour of air time
#   air_time_h - air time converted from minutes to hours
# The speed previously divided by `arr_time`, which is a clock reading (the
# arrival time), not a duration; it must be divided by the time spent in the
# air. Rows with a missing air_time yield NA in both new columns.
df1 <- flights %>%
  transmute(v_tb = round(distance / (air_time / 60), 2))
df2 <- flights %>%
  transmute(air_time_h = round(air_time / 60, 2))
# Parentheses print the combined table right after assigning it.
(new1 <- bind_cols(flights, df1, df2))
## # A tibble: 336,776 × 21
## year month day dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <int> <int> <dbl> <int>
## 1 2013 1 1 517 515 2 830
## 2 2013 1 1 533 529 4 850
## 3 2013 1 1 542 540 2 923
## 4 2013 1 1 544 545 -1 1004
## 5 2013 1 1 554 600 -6 812
## 6 2013 1 1 554 558 -4 740
## 7 2013 1 1 555 600 -5 913
## 8 2013 1 1 557 600 -3 709
## 9 2013 1 1 557 600 -3 838
## 10 2013 1 1 558 600 -2 753
## # ... with 336,766 more rows, and 14 more variables: sched_arr_time <int>,
## # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>, v_tb <dbl>, air_time_h <dbl>
# Three carriers operating the most flights.
flights_by_carrier <- flights %>%
  group_by(carrier) %>%
  summarise(freq = n())
flights_by_carrier %>%
  arrange(desc(freq)) %>%
  head(3)
## # A tibble: 3 × 2
## carrier freq
## <chr> <int>
## 1 UA 58665
## 2 B6 54635
## 3 EV 54173
# Mean departure delay (rounded to whole minutes) of delayed UA flights, one
# row per month.
# The earlier version wrapped a `for` loop in data.frame(); a `for` loop
# evaluates to NULL and its per-iteration results were discarded, so `love`
# was always an empty data frame. A grouped summary computes all months at once.
love <- flights %>%
  filter(carrier == "UA", dep_delay > 0) %>%
  group_by(month) %>%
  summarise(mean_dep_delay = round(mean(dep_delay, na.rm = TRUE))) %>%
  as.data.frame()
love
## (the previously recorded output, an empty data frame, came from the broken
## loop; re-run this chunk to print the monthly means)