This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
When you click the **Knit** button, a document will be generated that includes both the content and the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
# Install the required packages only when they are missing. An unconditional
# install.packages() call re-downloads the package on every knit and fails in
# non-interactive sessions where no CRAN mirror is configured.
if (!requireNamespace("nycflights13", quietly = TRUE)) {
  install.packages("nycflights13")
}
## Installing package into '/home/testRstudioagain/R/x86_64-pc-linux-gnu-library/4.1'
## (as 'lib' is unspecified)
if (!requireNamespace("dplyr", quietly = TRUE)) {
  install.packages("dplyr")
}
## Installing package into '/home/testRstudioagain/R/x86_64-pc-linux-gnu-library/4.1'
## (as 'lib' is unspecified)
# The workspace is intentionally not wiped with rm(list = ls()): the Knit
# button already runs the document in a fresh R session, and clearing the
# global environment would destroy the objects of anyone who runs this code
# interactively.
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5 ✓ purrr 0.3.4
## ✓ tibble 3.1.5 ✓ dplyr 1.0.7
## ✓ tidyr 1.1.4 ✓ stringr 1.4.0
## ✓ readr 2.0.2 ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(nycflights13)
library(dplyr)
# Copy the four datasets used below into the global environment.
data(flights, airlines, airports, weather)

# Preview the flights table.
flights
## # A tibble: 336,776 × 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 1 517 515 2 830 819
## 2 2013 1 1 533 529 4 850 830
## 3 2013 1 1 542 540 2 923 850
## 4 2013 1 1 544 545 -1 1004 1022
## 5 2013 1 1 554 600 -6 812 837
## 6 2013 1 1 554 558 -4 740 728
## 7 2013 1 1 555 600 -5 913 854
## 8 2013 1 1 557 600 -3 709 723
## 9 2013 1 1 557 600 -3 838 846
## 10 2013 1 1 558 600 -2 753 745
## # … with 336,766 more rows, and 11 more variables: arr_delay <dbl>,
## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
# Show the carrier code -> airline name lookup table.
airlines
## # A tibble: 16 × 2
## carrier name
## <chr> <chr>
## 1 9E Endeavor Air Inc.
## 2 AA American Airlines Inc.
## 3 AS Alaska Airlines Inc.
## 4 B6 JetBlue Airways
## 5 DL Delta Air Lines Inc.
## 6 EV ExpressJet Airlines Inc.
## 7 F9 Frontier Airlines Inc.
## 8 FL AirTran Airways Corporation
## 9 HA Hawaiian Airlines Inc.
## 10 MQ Envoy Air
## 11 OO SkyWest Airlines Inc.
## 12 UA United Air Lines Inc.
## 13 US US Airways Inc.
## 14 VX Virgin America
## 15 WN Southwest Airlines Co.
## 16 YV Mesa Airlines Inc.
# Show the airport table (FAA code, name, coordinates, altitude, time zone).
airports
## # A tibble: 1,458 × 8
## faa name lat lon alt tz dst tzone
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <chr> <chr>
## 1 04G Lansdowne Airport 41.1 -80.6 1044 -5 A America/…
## 2 06A Moton Field Municipal Airport 32.5 -85.7 264 -6 A America/…
## 3 06C Schaumburg Regional 42.0 -88.1 801 -6 A America/…
## 4 06N Randall Airport 41.4 -74.4 523 -5 A America/…
## 5 09J Jekyll Island Airport 31.1 -81.4 11 -5 A America/…
## 6 0A9 Elizabethton Municipal Airport 36.4 -82.2 1593 -5 A America/…
## 7 0G6 Williams County Airport 41.5 -84.5 730 -5 A America/…
## 8 0G7 Finger Lakes Regional Airport 42.9 -76.8 492 -5 A America/…
## 9 0P2 Shoestring Aviation Airfield 39.8 -76.6 1000 -5 U America/…
## 10 0S9 Jefferson County Intl 48.1 -123. 108 -8 A America/…
## # … with 1,448 more rows
# Show the weather observations, keyed by origin airport and hour.
weather
## # A tibble: 26,115 × 15
## origin year month day hour temp dewp humid wind_dir wind_speed
## <chr> <int> <int> <int> <int> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 EWR 2013 1 1 1 39.0 26.1 59.4 270 10.4
## 2 EWR 2013 1 1 2 39.0 27.0 61.6 250 8.06
## 3 EWR 2013 1 1 3 39.0 28.0 64.4 240 11.5
## 4 EWR 2013 1 1 4 39.9 28.0 62.2 250 12.7
## 5 EWR 2013 1 1 5 39.0 28.0 64.4 260 12.7
## 6 EWR 2013 1 1 6 37.9 28.0 67.2 240 11.5
## 7 EWR 2013 1 1 7 39.0 28.0 64.4 240 15.0
## 8 EWR 2013 1 1 8 39.9 28.0 62.2 250 10.4
## 9 EWR 2013 1 1 9 39.9 28.0 62.2 260 15.0
## 10 EWR 2013 1 1 10 41 28.0 59.6 260 13.8
## # … with 26,105 more rows, and 5 more variables: wind_gust <dbl>, precip <dbl>,
## # pressure <dbl>, visib <dbl>, time_hour <dttm>
# Keep only the date, delay, schedule and carrier columns.
flights %>%
  select(
    year, month, day, arr_delay, dep_time, sched_dep_time, dep_delay,
    arr_time, sched_arr_time, carrier
  )
## # A tibble: 336,776 × 10
## year month day arr_delay dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <dbl> <int> <int> <dbl> <int>
## 1 2013 1 1 11 517 515 2 830
## 2 2013 1 1 20 533 529 4 850
## 3 2013 1 1 33 542 540 2 923
## 4 2013 1 1 -18 544 545 -1 1004
## 5 2013 1 1 -25 554 600 -6 812
## 6 2013 1 1 12 554 558 -4 740
## 7 2013 1 1 19 555 600 -5 913
## 8 2013 1 1 -14 557 600 -3 709
## 9 2013 1 1 -8 557 600 -3 838
## 10 2013 1 1 8 558 600 -2 753
## # … with 336,766 more rows, and 2 more variables: sched_arr_time <int>,
## # carrier <chr>
# Flights that departed on 1 January.
flights %>%
  filter(month == 1 & day == 1)
## # A tibble: 842 × 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 1 517 515 2 830 819
## 2 2013 1 1 533 529 4 850 830
## 3 2013 1 1 542 540 2 923 850
## 4 2013 1 1 544 545 -1 1004 1022
## 5 2013 1 1 554 600 -6 812 837
## 6 2013 1 1 554 558 -4 740 728
## 7 2013 1 1 555 600 -5 913 854
## 8 2013 1 1 557 600 -3 709 723
## 9 2013 1 1 557 600 -3 838 846
## 10 2013 1 1 558 600 -2 753 745
## # … with 832 more rows, and 11 more variables: arr_delay <dbl>, carrier <chr>,
## # flight <int>, tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>,
## # distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
# Flights that arrived no more than two hours late. filter() keeps only rows
# where the condition is TRUE, so flights with a missing arr_delay are dropped
# as well.
flights %>%
  filter(arr_delay <= 120)
## # A tibble: 317,312 × 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 1 517 515 2 830 819
## 2 2013 1 1 533 529 4 850 830
## 3 2013 1 1 542 540 2 923 850
## 4 2013 1 1 544 545 -1 1004 1022
## 5 2013 1 1 554 600 -6 812 837
## 6 2013 1 1 554 558 -4 740 728
## 7 2013 1 1 555 600 -5 913 854
## 8 2013 1 1 557 600 -3 709 723
## 9 2013 1 1 557 600 -3 838 846
## 10 2013 1 1 558 600 -2 753 745
## # … with 317,302 more rows, and 11 more variables: arr_delay <dbl>,
## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
# Sort flights from the largest arrival delay to the smallest.
flights %>%
  arrange(desc(arr_delay))
## # A tibble: 336,776 × 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 9 641 900 1301 1242 1530
## 2 2013 6 15 1432 1935 1137 1607 2120
## 3 2013 1 10 1121 1635 1126 1239 1810
## 4 2013 9 20 1139 1845 1014 1457 2210
## 5 2013 7 22 845 1600 1005 1044 1815
## 6 2013 4 10 1100 1900 960 1342 2211
## 7 2013 3 17 2321 810 911 135 1020
## 8 2013 7 22 2257 759 898 121 1026
## 9 2013 12 5 756 1700 896 1058 2020
## 10 2013 5 3 1133 2055 878 1250 2215
## # … with 336,766 more rows, and 11 more variables: arr_delay <dbl>,
## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
# Mean arrival delay per carrier, worst carrier first.
# NOTE: despite its name, `by_day` groups the flights by carrier, not by day.
by_day <- flights %>%
  group_by(carrier)
arrdelay <- by_day %>%
  summarise(delay = mean(arr_delay, na.rm = TRUE))
arrdelay %>%
  arrange(desc(delay))
## # A tibble: 16 × 2
## carrier delay
## <chr> <dbl>
## 1 F9 21.9
## 2 FL 20.1
## 3 EV 15.8
## 4 YV 15.6
## 5 OO 11.9
## 6 MQ 10.8
## 7 WN 9.65
## 8 B6 9.46
## 9 9E 7.38
## 10 UA 3.56
## 11 US 2.13
## 12 VX 1.76
## 13 DL 1.64
## 14 AA 0.364
## 15 HA -6.92
## 16 AS -9.93
# Mean arrival delay per scheduled departure hour, worst hour first.
# An hour whose arr_delay values are all missing yields NaN, because
# na.rm = TRUE leaves mean() with nothing to average (see hour 1 below).
by_hour <- flights %>%
  group_by(hour)
arrivaldelay <- by_hour %>%
  summarise(delay = mean(arr_delay, na.rm = TRUE))
arrivaldelay %>%
  arrange(desc(delay))
## # A tibble: 20 × 2
## hour delay
## <dbl> <dbl>
## 1 21 18.4
## 2 20 16.7
## 3 19 16.7
## 4 17 16.0
## 5 22 16.0
## 6 18 14.8
## 7 16 12.6
## 8 15 12.3
## 9 23 11.8
## 10 14 9.20
## 11 13 6.54
## 12 12 3.49
## 13 11 1.48
## 14 10 0.954
## 15 8 -1.11
## 16 9 -1.45
## 17 6 -3.38
## 18 5 -4.80
## 19 7 -5.30
## 20 1 NaN
# Attach the airline name to the first 100 flights. The join key is spelled
# out so the join cannot silently pick up any other same-named column, and no
# "Joining, by = ..." message is emitted.
ans6 <- flights %>%
  head(100) %>%
  select(year, month, day, hour, origin, dest, tailnum, carrier) %>%
  left_join(airlines, by = "carrier")
# Add the weather at the origin airport for the flight's hour, then the origin
# airport's metadata. The keys are explicit on purpose: `ans6` and `airports`
# both carry a `name` column (airline name vs. airport name), so a natural join
# would key on `name` and match airline names against airport names. The
# airport name is renamed to keep the two apart. Use c("dest" = "faa") instead
# to describe the destination airport.
ans6 %>%
  left_join(weather, by = c("year", "month", "day", "hour", "origin")) %>%
  left_join(
    rename(airports, airport_name = name),
    by = c("origin" = "faa")
  )
## # A tibble: 100 × 26
## year month day hour origin dest tailnum carrier name temp dewp humid
## <int> <int> <int> <dbl> <chr> <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl>
## 1 2013 1 1 5 EWR IAH N14228 UA Unite… 39.0 28.0 64.4
## 2 2013 1 1 5 LGA IAH N24211 UA Unite… 39.9 25.0 54.8
## 3 2013 1 1 5 JFK MIA N619AA AA Ameri… 39.0 27.0 61.6
## 4 2013 1 1 5 JFK BQN N804JB B6 JetBl… 39.0 27.0 61.6
## 5 2013 1 1 6 LGA ATL N668DN DL Delta… 39.9 25.0 54.8
## 6 2013 1 1 5 EWR ORD N39463 UA Unite… 39.0 28.0 64.4
## 7 2013 1 1 6 EWR FLL N516JB B6 JetBl… 37.9 28.0 67.2
## 8 2013 1 1 6 LGA IAD N829AS EV Expre… 39.9 25.0 54.8
## 9 2013 1 1 6 JFK MCO N593JB B6 JetBl… 37.9 27.0 64.3
## 10 2013 1 1 6 LGA ORD N3ALAA AA Ameri… 39.9 25.0 54.8
## # … with 90 more rows, and 14 more variables: wind_dir <dbl>, wind_speed <dbl>,
## # wind_gust <dbl>, precip <dbl>, pressure <dbl>, visib <dbl>,
## # time_hour <dttm>, airport_name <chr>, lat <dbl>, lon <dbl>, alt <dbl>,
## # tz <dbl>, dst <chr>, tzone <chr>
# Keep only flights whose destination is ALB, BDL or BTV, then print them.
subflights <- flights %>%
  filter(dest %in% c("ALB", "BDL", "BTV"))
subflights
## # A tibble: 3,471 × 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 1 908 910 -2 1020 1027
## 2 2013 1 1 1111 1115 -4 1222 1226
## 3 2013 1 1 1202 1207 -5 1318 1314
## 4 2013 1 1 1315 1317 -2 1413 1423
## 5 2013 1 1 1318 1322 -4 1358 1416
## 6 2013 1 1 1655 1621 34 1804 1724
## 7 2013 1 1 1711 1650 21 1820 1806
## 8 2013 1 1 1842 1422 260 1958 1535
## 9 2013 1 1 2056 2004 52 2156 2112
## 10 2013 1 1 2302 2200 62 2342 2253
## # … with 3,461 more rows, and 11 more variables: arr_delay <dbl>,
## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
# Carrier and destination of each selected flight.
subflights %>%
  select(carrier, dest)
## # A tibble: 3,471 × 2
## carrier dest
## <chr> <chr>
## 1 B6 BTV
## 2 B6 BTV
## 3 EV BTV
## 4 EV ALB
## 5 EV BDL
## 6 EV ALB
## 7 EV BTV
## 8 EV BTV
## 9 EV ALB
## 10 EV BDL
## # … with 3,461 more rows
# Number of selected flights per destination.
subflights %>%
  count(dest)
## # A tibble: 3 × 2
## dest n
## <chr> <int>
## 1 ALB 439
## 2 BDL 443
## 3 BTV 2589
# Mean departure delay for every carrier / month / origin combination.
# `.groups = "drop_last"` states the grouping of the result explicitly: the
# output stays grouped by carrier and month, exactly as the default would
# leave it, but without the informational message about the implicit choice.
by_com <- group_by(flights, carrier, month, origin)
summarise(
  by_com,
  delay = mean(dep_delay, na.rm = TRUE),
  .groups = "drop_last"
)
## # A tibble: 399 × 4
## # Groups: carrier, month [185]
## carrier month origin delay
## <chr> <int> <chr> <dbl>
## 1 9E 1 EWR 12.9
## 2 9E 1 JFK 17.1
## 3 9E 1 LGA 17.4
## 4 9E 2 EWR -1.18
## 5 9E 2 JFK 18.0
## 6 9E 2 LGA 6.08
## 7 9E 3 EWR 5.6
## 8 9E 3 JFK 14.4
## 9 9E 3 LGA 6.95
## 10 9E 4 EWR 5.87
## # … with 389 more rows
# Column-wise summary statistics of the built-in `cars` dataset.
summary(datasets::cars)
## speed dist
## Min. : 4.0 Min. : 2.00
## 1st Qu.:12.0 1st Qu.: 26.00
## Median :15.0 Median : 36.00
## Mean :15.4 Mean : 42.98
## 3rd Qu.:19.0 3rd Qu.: 56.00
## Max. :25.0 Max. :120.00
You can also embed plots, for example:
Note that the `echo = FALSE` parameter was added to the code chunk to prevent printing of the R code that generated the plot.