# Toy numeric vector with missing values, used to demonstrate NA handling.
x <- c(NA, 3, 14, NA, 33, 17, NA, 41)
# Total number of missing entries.
sum(is.na(x))
## [1] 3
# Positions of the missing entries.
which(is.na(x))
## [1] 1 4 7
# Per-element missingness flag (1 = missing, 0 = observed). is.na() is
# already vectorised, so no sapply() over single elements is needed.
as.integer(is.na(x))
## [1] 1 0 0 1 0 0 1 0
# Copy of x with the missing entries dropped.
SinNA <- na.omit(x)
head(SinNA, n = 10)
## [1] 3 14 33 17 41
# Small sales table whose Name column contains missing values.
df_1 <- data.frame(
  Name = c(NA, "Joseph", "Martin", NA, "Andrea"),
  Sales = c(15, 18, 21, 56, 60),
  Price = c(34, 52, 21, 44, 20),
  stringsAsFactors = FALSE
)
print(df_1)
## Name Sales Price
## 1 <NA> 15 34
## 2 Joseph 18 52
## 3 Martin 21 21
## 4 <NA> 56 44
## 5 Andrea 60 20
# Number of missing values in each column.
vapply(df_1, function(column) sum(is.na(column)), integer(1))
## Name Sales Price
## 2 0 0
# Keep only the rows that have a Name; original row names are preserved.
df_2 <- df_1[!is.na(df_1[["Name"]]), , drop = FALSE]
head(df_2)
## Name Sales Price
## 2 Joseph 18 52
## 3 Martin 21 21
## 5 Andrea 60 20
# UA flights whose dep_delay is at least 480.
flights %>%
  filter(carrier == "UA", dep_delay >= 480)
## # A tibble: 1 x 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 7 26 2345 1542 483 104 1729
## # … with 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## # tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## # hour <dbl>, minute <dbl>, time_hour <dttm>
# UA flights whose dep_delay exceeds 160.
flights %>%
  filter(carrier == "UA" & dep_delay > 160)
## # A tibble: 763 x 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 2 1412 838 334 1710 1147
## 2 2013 1 2 2131 1512 379 2340 1741
## 3 2013 1 3 1503 1221 162 1803 1555
## 4 2013 1 4 2221 1858 203 116 2240
## 5 2013 1 5 1930 1545 225 2139 1806
## 6 2013 1 6 1242 920 202 1527 1233
## 7 2013 1 7 1323 830 293 1604 1154
## 8 2013 1 9 2223 1810 253 111 2104
## 9 2013 1 10 1525 900 385 1713 1039
## 10 2013 1 10 2137 1630 307 17 1925
## # … with 753 more rows, and 11 more variables: arr_delay <dbl>, carrier <chr>,
## # flight <int>, tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>,
## # distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
# Flights with arr_delay above 120, stored as vuelos_retrasados
# ("delayed flights"), followed by how many there are.
vuelos_retrasados <- flights %>%
  filter(arr_delay > 120)
nrow(vuelos_retrasados)
## [1] 10034
# Flights bound for IAH or HOU. A plain row filter needs no group_by();
# dropping it keeps the result an ungrouped tibble, so the recorded output
# below no longer carries a "Groups:" header line.
flights %>%
  filter(dest %in% c("IAH", "HOU"))
## # A tibble: 9,313 x 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 1 517 515 2 830 819
## 2 2013 1 1 533 529 4 850 830
## 3 2013 1 1 623 627 -4 933 932
## 4 2013 1 1 728 732 -4 1041 1038
## 5 2013 1 1 739 739 0 1104 1038
## 6 2013 1 1 908 908 0 1228 1219
## 7 2013 1 1 1028 1026 2 1350 1339
## 8 2013 1 1 1044 1045 -1 1352 1351
## 9 2013 1 1 1114 900 134 1447 1222
## 10 2013 1 1 1205 1200 5 1503 1505
## # … with 9,303 more rows, and 11 more variables: arr_delay <dbl>,
## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
# Flights operated by any of the carriers UA, AA or DL.
flights %>%
  filter(carrier %in% c("UA", "AA", "DL"))
## # A tibble: 139,504 x 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 1 517 515 2 830 819
## 2 2013 1 1 533 529 4 850 830
## 3 2013 1 1 542 540 2 923 850
## 4 2013 1 1 554 600 -6 812 837
## 5 2013 1 1 554 558 -4 740 728
## 6 2013 1 1 558 600 -2 753 745
## 7 2013 1 1 558 600 -2 924 917
## 8 2013 1 1 558 600 -2 923 937
## 9 2013 1 1 559 600 -1 941 910
## 10 2013 1 1 559 600 -1 854 902
## # … with 139,494 more rows, and 11 more variables: arr_delay <dbl>,
## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
# Flights whose month is 7, 8 or 9.
flights %>%
  filter(month %in% 7:9)
## # A tibble: 86,326 x 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 7 1 1 2029 212 236 2359
## 2 2013 7 1 2 2359 3 344 344
## 3 2013 7 1 29 2245 104 151 1
## 4 2013 7 1 43 2130 193 322 14
## 5 2013 7 1 44 2150 174 300 100
## 6 2013 7 1 46 2051 235 304 2358
## 7 2013 7 1 48 2001 287 308 2305
## 8 2013 7 1 58 2155 183 335 43
## 9 2013 7 1 100 2146 194 327 30
## 10 2013 7 1 100 2245 135 337 135
## # … with 86,316 more rows, and 11 more variables: arr_delay <dbl>,
## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
# Flights with arr_delay above 120 whose dep_delay is exactly 0.
# NOTE(review): the exact equality excludes negative dep_delay values —
# confirm that this is intended.
flights %>%
  filter(arr_delay > 120, dep_delay == 0)
## # A tibble: 3 x 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 10 7 1350 1350 0 1736 1526
## 2 2013 5 23 1810 1810 0 2208 2000
## 3 2013 7 1 905 905 0 1443 1223
## # … with 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## # tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## # hour <dbl>, minute <dbl>, time_hour <dttm>
# Flights with dep_delay of at least 60 whose arr_delay is exactly 30.
# NOTE(review): arr_delay is matched with ==, so only the single value 30
# qualifies — confirm that a range was not intended.
flights %>%
  filter(dep_delay >= 60, arr_delay == 30)
## # A tibble: 33 x 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 10 5 1605 1500 65 1857 1827
## 2 2013 10 9 2056 1955 61 2217 2147
## 3 2013 10 11 2101 2000 61 2202 2132
## 4 2013 10 15 2008 1900 68 2131 2101
## 5 2013 11 6 2208 2100 68 2309 2239
## 6 2013 11 9 2021 1915 66 2203 2133
## 7 2013 12 27 1709 1605 64 1906 1836
## 8 2013 12 28 1953 1845 68 2250 2220
## 9 2013 2 16 1741 1630 71 1917 1847
## 10 2013 3 3 1719 1605 74 1903 1833
## # … with 23 more rows, and 11 more variables: arr_delay <dbl>, carrier <chr>,
## # flight <int>, tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>,
## # distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
# Rows with the largest air_time. top_n() takes the row count first and the
# ranking column second; passing air_time alone made dplyr treat it as `n`,
# rank by the last column (time_hour) instead, and emit a warning.
# NOTE(review): n = 10 is assumed, mirroring the top_n(10, arr_delay) call
# elsewhere in this script — confirm the intended count.
flights %>%
  top_n(10, air_time)
# NOTE(review): the output previously recorded here (191 rows selected by
# time_hour, plus the warning) came from the mis-specified call and was
# removed; re-run the script to record the corrected result.
# The ten rows with the largest arr_delay, kept in their original row order.
flights %>%
  top_n(n = 10, wt = arr_delay)
## # A tibble: 10 x 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 9 641 900 1301 1242 1530
## 2 2013 1 10 1121 1635 1126 1239 1810
## 3 2013 12 5 756 1700 896 1058 2020
## 4 2013 3 17 2321 810 911 135 1020
## 5 2013 4 10 1100 1900 960 1342 2211
## 6 2013 5 3 1133 2055 878 1250 2215
## 7 2013 6 15 1432 1935 1137 1607 2120
## 8 2013 7 22 845 1600 1005 1044 1815
## 9 2013 7 22 2257 759 898 121 1026
## 10 2013 9 20 1139 1845 1014 1457 2210
## # … with 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## # tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## # hour <dbl>, minute <dbl>, time_hour <dttm>
# Sort by origin, then dest, then distance from largest to smallest.
arrange(flights, origin, dest, desc(distance))
## # A tibble: 336,776 x 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 1 1315 1317 -2 1413 1423
## 2 2013 1 1 1655 1621 34 1804 1724
## 3 2013 1 1 2056 2004 52 2156 2112
## 4 2013 1 2 1332 1327 5 1419 1433
## 5 2013 1 2 1746 1621 85 1835 1724
## 6 2013 1 2 2148 2004 104 2234 2112
## 7 2013 1 3 1716 1619 57 1803 1723
## 8 2013 1 3 2031 2038 -7 2131 2139
## 9 2013 1 4 1618 1619 -1 1714 1723
## 10 2013 1 4 2031 2000 31 2131 2101
## # … with 336,766 more rows, and 11 more variables: arr_delay <dbl>,
## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
# Sort all flights by arr_delay, largest first.
arrange(flights, desc(arr_delay))
## # A tibble: 336,776 x 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 9 641 900 1301 1242 1530
## 2 2013 6 15 1432 1935 1137 1607 2120
## 3 2013 1 10 1121 1635 1126 1239 1810
## 4 2013 9 20 1139 1845 1014 1457 2210
## 5 2013 7 22 845 1600 1005 1044 1815
## 6 2013 4 10 1100 1900 960 1342 2211
## 7 2013 3 17 2321 810 911 135 1020
## 8 2013 7 22 2257 759 898 121 1026
## 9 2013 12 5 756 1700 896 1058 2020
## 10 2013 5 3 1133 2055 878 1250 2215
## # … with 336,766 more rows, and 11 more variables: arr_delay <dbl>,
## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
# Rows with a missing dep_delay come first; within each of the two groups
# the rows are ordered by arr_delay from largest to smallest.
flights %>%
  arrange(desc(is.na(dep_delay)), desc(arr_delay))
## # A tibble: 336,776 x 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 1 NA 1630 NA NA 1815
## 2 2013 1 1 NA 1935 NA NA 2240
## 3 2013 1 1 NA 1500 NA NA 1825
## 4 2013 1 1 NA 600 NA NA 901
## 5 2013 1 2 NA 1540 NA NA 1747
## 6 2013 1 2 NA 1620 NA NA 1746
## 7 2013 1 2 NA 1355 NA NA 1459
## 8 2013 1 2 NA 1420 NA NA 1644
## 9 2013 1 2 NA 1321 NA NA 1536
## 10 2013 1 2 NA 1545 NA NA 1910
## # … with 336,766 more rows, and 11 more variables: arr_delay <dbl>,
## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
# Keep dep_time, dep_delay, arr_time and arr_delay, picked by their
# "dep_" / "arr_" prefixes (the sched_* columns do not start with either).
flights %>%
  select(starts_with("dep_"), starts_with("arr_"))
## # A tibble: 336,776 x 4
## dep_time dep_delay arr_time arr_delay
## <int> <dbl> <int> <dbl>
## 1 517 2 830 11
## 2 533 4 850 20
## 3 542 2 923 33
## 4 544 -1 1004 -18
## 5 554 -6 812 -25
## 6 554 -4 740 12
## 7 555 -5 913 19
## 8 557 -3 709 -14
## 9 557 -3 838 -8
## 10 558 -2 753 8
## # … with 336,766 more rows
# Same four columns, each named twice in the select() call; the printed
# result still has only four columns.
F2 <- flights %>%
  select(
    dep_time, dep_delay, arr_time, arr_delay,
    dep_time, dep_delay, arr_time, arr_delay
  )
print(F2)
## # A tibble: 336,776 x 4
## dep_time dep_delay arr_time arr_delay
## <int> <dbl> <int> <dbl>
## 1 517 2 830 11
## 2 533 4 850 20
## 3 542 2 923 33
## 4 544 -1 1004 -18
## 5 554 -6 812 -25
## 6 554 -4 740 12
## 7 555 -5 913 19
## 8 557 -3 709 -14
## 9 557 -3 838 -8
## 10 558 -2 753 8
## # … with 336,766 more rows
# Select columns through name-matching helpers. The recorded output shows
# the helpers matching lower-case column names even though the patterns are
# written with capital letters.
flights %>%
  select(
    ends_with("Y"),
    contains("Time"),
    starts_with("A")
  )
## # A tibble: 336,776 x 9
## day dep_delay arr_delay dep_time sched_dep_time arr_time sched_arr_time
## <int> <dbl> <dbl> <int> <int> <int> <int>
## 1 1 2 11 517 515 830 819
## 2 1 4 20 533 529 850 830
## 3 1 2 33 542 540 923 850
## 4 1 -1 -18 544 545 1004 1022
## 5 1 -6 -25 554 600 812 837
## 6 1 -4 12 554 558 740 728
## 7 1 -5 19 555 600 913 854
## 8 1 -3 -14 557 600 709 723
## 9 1 -3 -8 557 600 838 846
## 10 1 -2 8 558 600 753 745
## # … with 336,766 more rows, and 2 more variables: air_time <dbl>,
## # time_hour <dttm>
# Keep air_time, arr_time and dep_time, in that order.
select(flights, air_time, arr_time, dep_time)
## # A tibble: 336,776 x 3
## air_time arr_time dep_time
## <dbl> <int> <int>
## 1 227 830 517
## 2 227 850 533
## 3 160 923 542
## 4 183 1004 544
## 5 116 812 554
## 6 150 740 554
## 7 158 913 555
## 8 53 709 557
## 9 140 838 557
## 10 138 753 558
## # … with 336,766 more rows
# Append three unnamed derived columns, each an existing column multiplied
# by 60; dplyr labels them with the expression text.
# NOTE(review): dep_time / sched_dep_time values look like clock readings
# (e.g. 517) — confirm that scaling them by 60 is what is wanted.
flights %>%
  mutate(
    dep_time * 60,
    sched_dep_time * 60,
    dep_delay * 60
  )
## # A tibble: 336,776 x 22
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 1 517 515 2 830 819
## 2 2013 1 1 533 529 4 850 830
## 3 2013 1 1 542 540 2 923 850
## 4 2013 1 1 544 545 -1 1004 1022
## 5 2013 1 1 554 600 -6 812 837
## 6 2013 1 1 554 558 -4 740 728
## 7 2013 1 1 555 600 -5 913 854
## 8 2013 1 1 557 600 -3 709 723
## 9 2013 1 1 557 600 -3 838 846
## 10 2013 1 1 558 600 -2 753 745
## # … with 336,766 more rows, and 14 more variables: arr_delay <dbl>,
## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>,
## # `dep_time * 60` <dbl>, `sched_dep_time * 60` <dbl>, `dep_delay * 60` <dbl>
# Mean dep_delay for each (year, month, day). dep_delay contains missing
# values, so without na.rm = TRUE the recorded daily means were all NA.
flights %>%
  group_by(year, month, day) %>%
  summarise(delay = mean(dep_delay, na.rm = TRUE))
# NOTE(review): the output previously recorded here (a `delay` column of NA
# values) no longer applies and was removed; re-run the script to record the
# corrected result.
# Largest distance per year, ignoring missing values.
flights %>%
  group_by(year) %>%
  summarise(max_distance = max(distance, na.rm = TRUE))
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 1 x 2
## year max_distance
## <int> <dbl>
## 1 2013 4983
# First and last recorded dep_time of each (year, month, day), taken in the
# data's row order. Rows with a missing dep_time are skipped: the recorded
# last_dep values were all NA because each day's trailing rows have no
# dep_time. The misspelled column name `frist_dep` is corrected to
# `first_dep`.
flights %>%
  group_by(year, month, day) %>%
  summarise(
    first_dep = first(dep_time[!is.na(dep_time)]),
    last_dep = last(dep_time[!is.na(dep_time)])
  )
# NOTE(review): the output previously recorded here (column `frist_dep` and
# an all-NA `last_dep`) no longer applies and was removed; re-run the script
# to record the corrected result.
# Total number of rows in flights.
flights %>%
  nrow()
## [1] 336776