#install.packages("nycflights13")

library(nycflights13)
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5     v purrr   0.3.4
## v tibble  3.1.6     v dplyr   1.0.8
## v tidyr   1.2.0     v stringr 1.4.0
## v readr   2.1.2     v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(psych)
## 
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha
# Open the data in the interactive viewer for a first look.
view(flights)

# Descriptive statistics for every column except `time_hour`: that column is a
# POSIXct date-time, which describe() cannot summarise (it only yields
# "no non-missing arguments to min/max" warnings and a row of NaN/Inf values).
# Columns marked with * in the output are non-numeric and were converted to
# numeric codes by describe().
describe(select(flights, -time_hour))
##                vars      n    mean      sd median trimmed     mad  min  max
## year              1 336776 2013.00    0.00   2013 2013.00    0.00 2013 2013
## month             2 336776    6.55    3.41      7    6.56    4.45    1   12
## day               3 336776   15.71    8.77     16   15.70   11.86    1   31
## dep_time          4 328521 1349.11  488.28   1401 1346.82  634.55    1 2400
## sched_dep_time    5 336776 1344.25  467.34   1359 1341.60  613.80  106 2359
## dep_delay         6 328521   12.64   40.21     -2    3.32    5.93  -43 1301
## arr_time          7 328063 1502.05  533.26   1535 1526.42  619.73    1 2400
## sched_arr_time    8 336776 1536.38  497.46   1556 1550.67  618.24    1 2359
## arr_delay         9 327346    6.90   44.63     -5   -1.03   20.76  -86 1272
## carrier*         10 336776    7.14    4.14      6    7.00    5.93    1   16
## flight           11 336776 1971.92 1632.47   1496 1830.51 1608.62    1 8500
## tailnum*         12 334264 1814.32 1199.75   1798 1778.21 1587.86    1 4043
## origin*          13 336776    1.95    0.82      2    1.94    1.48    1    3
## dest*            14 336776   50.03   28.12     50   49.56   32.62    1  105
## air_time         15 327346  150.69   93.69    129  140.03   75.61   20  695
## distance         16 336776 1039.91  733.23    872  955.27  569.32   17 4983
## hour             17 336776   13.18    4.66     13   13.15    5.93    1   23
## minute           18 336776   26.23   19.30     29   25.64   23.72    0   59
##                range  skew kurtosis   se
## year               0   NaN      NaN 0.00
## month             11 -0.01    -1.19 0.01
## day               30  0.01    -1.19 0.02
## dep_time        2399 -0.02    -1.09 0.85
## sched_dep_time  2253 -0.01    -1.20 0.81
## dep_delay       1344  4.80    43.95 0.07
## arr_time        2399 -0.47    -0.19 0.93
## sched_arr_time  2358 -0.35    -0.38 0.86
## arr_delay       1358  3.72    29.23 0.08
## carrier*          15  0.36    -1.21 0.01
## flight          8499  0.66    -0.85 2.81
## tailnum*        4042  0.17    -1.24 2.08
## origin*            2  0.09    -1.50 0.00
## dest*            104  0.13    -1.08 0.05
## air_time         675  1.07     0.86 0.16
## distance        4966  1.13     1.19 1.26
## hour              22  0.00    -1.21 0.01
## minute            59  0.09    -1.24 0.03
# Bind the packaged dataset to a variable in the working environment.
flights <- flights

# Show the structure: column names, types and the first few values.
str(flights)
## tibble [336,776 x 19] (S3: tbl_df/tbl/data.frame)
##  $ year          : int [1:336776] 2013 2013 2013 2013 2013 2013 2013 2013 2013 2013 ...
##  $ month         : int [1:336776] 1 1 1 1 1 1 1 1 1 1 ...
##  $ day           : int [1:336776] 1 1 1 1 1 1 1 1 1 1 ...
##  $ dep_time      : int [1:336776] 517 533 542 544 554 554 555 557 557 558 ...
##  $ sched_dep_time: int [1:336776] 515 529 540 545 600 558 600 600 600 600 ...
##  $ dep_delay     : num [1:336776] 2 4 2 -1 -6 -4 -5 -3 -3 -2 ...
##  $ arr_time      : int [1:336776] 830 850 923 1004 812 740 913 709 838 753 ...
##  $ sched_arr_time: int [1:336776] 819 830 850 1022 837 728 854 723 846 745 ...
##  $ arr_delay     : num [1:336776] 11 20 33 -18 -25 12 19 -14 -8 8 ...
##  $ carrier       : chr [1:336776] "UA" "UA" "AA" "B6" ...
##  $ flight        : int [1:336776] 1545 1714 1141 725 461 1696 507 5708 79 301 ...
##  $ tailnum       : chr [1:336776] "N14228" "N24211" "N619AA" "N804JB" ...
##  $ origin        : chr [1:336776] "EWR" "LGA" "JFK" "JFK" ...
##  $ dest          : chr [1:336776] "IAH" "IAH" "MIA" "BQN" ...
##  $ air_time      : num [1:336776] 227 227 160 183 116 150 158 53 140 138 ...
##  $ distance      : num [1:336776] 1400 1416 1089 1576 762 ...
##  $ hour          : num [1:336776] 5 5 5 5 6 5 6 6 6 6 ...
##  $ minute        : num [1:336776] 15 29 40 45 0 58 0 0 0 0 ...
##  $ time_hour     : POSIXct[1:336776], format: "2013-01-01 05:00:00" "2013-01-01 05:00:00" ...
# Distinct departure airports.
unique(flights[["origin"]])
## [1] "EWR" "LGA" "JFK"
# Distinct destination airports.
unique(flights[["dest"]])
##   [1] "IAH" "MIA" "BQN" "ATL" "ORD" "FLL" "IAD" "MCO" "PBI" "TPA" "LAX" "SFO"
##  [13] "DFW" "BOS" "LAS" "MSP" "DTW" "RSW" "SJU" "PHX" "BWI" "CLT" "BUF" "DEN"
##  [25] "SNA" "MSY" "SLC" "XNA" "MKE" "SEA" "ROC" "SYR" "SRQ" "RDU" "CMH" "JAX"
##  [37] "CHS" "MEM" "PIT" "SAN" "DCA" "CLE" "STL" "MYR" "JAC" "MDW" "HNL" "BNA"
##  [49] "AUS" "BTV" "PHL" "STT" "EGE" "AVL" "PWM" "IND" "SAV" "CAK" "HOU" "LGB"
##  [61] "DAY" "ALB" "BDL" "MHT" "MSN" "GSO" "CVG" "BUR" "RIC" "GSP" "GRR" "MCI"
##  [73] "ORF" "SAT" "SDF" "PDX" "SJC" "OMA" "CRW" "OAK" "SMF" "TUL" "TYS" "OKC"
##  [85] "PVD" "DSM" "PSE" "BHM" "CAE" "HDN" "BZN" "MTJ" "EYW" "PSP" "ACK" "BGR"
##  [97] "ABQ" "ILM" "MVY" "SBN" "LEX" "CHO" "TVC" "ANC" "LGA"
# Summarise air time per group for an already grouped set of flights.
#
# by_month: a data frame grouped by `month` that has an `air_time` column.
# Returns one row per month with the mean, median, standard deviation,
# minimum and maximum of `air_time` (missing values are ignored), each
# rounded to two digits.
summarise_air_time <- function(by_month) {
  by_month %>%
    summarise(
      mean_air_time = mean(air_time, na.rm = TRUE),
      median_air_time = median(air_time, na.rm = TRUE),
      sd_air_time = sd(air_time, na.rm = TRUE),
      min_air_time = min(air_time, na.rm = TRUE),
      max_air_time = max(air_time, na.rm = TRUE)
    ) %>%
    # Round the statistics by column name instead of by column position, so
    # the grouping column is never touched even if the layout changes.
    mutate(across(ends_with("_air_time"), function(x) round(x, digits = 2)))
}

# select flights to DCA
DCA_flights <- filter(flights, dest == "DCA")

# summarize data by month (rounded to two digits)
dca_by_months <- group_by(DCA_flights, month)
dca_sum <- summarise_air_time(dca_by_months)

# select flights to IAD
IAD_flights <- filter(flights, dest == "IAD")

# summarize data by month (rounded to two digits)
iad_by_months <- group_by(IAD_flights, month)
iad_sum <- summarise_air_time(iad_by_months)
# load the package 
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
# Replace the month numbers (1-12) with month names. `month.name` is a base R
# constant holding the English month names, so the labels match the English
# column headers below regardless of the session locale, and no round trip
# through date parsing is needed.
dca_sum$month <- month.name[dca_sum$month]
iad_sum$month <- month.name[iad_sum$month]

# change column headers (both tables share the same layout)
summary_headers <- c(
  "Month", "Mean", "Median", "Std. Deviation", "Min. Value", "Max. Value"
)
names(dca_sum) <- summary_headers
names(iad_sum) <- summary_headers

The `kable()` function is a table generator for R Markdown. It can be configured to produce output in LaTeX, HTML, and other formats; see `?kable` for more information.

 # Print the monthly air-time summary table for flights to DCA.
 print(dca_sum)
## # A tibble: 12 x 6
##    Month      Mean Median `Std. Deviation` `Min. Value` `Max. Value`
##    <chr>     <dbl>  <dbl>            <dbl>        <dbl>        <dbl>
##  1 January    47.8   47               6.14           36           91
##  2 February   46.9   46               5.31           37           70
##  3 March      45.5   45               5.43           32           83
##  4 April      46.7   45.5             6.54           36          105
##  5 May        45.0   44               6.5            34           92
##  6 June       46.7   45               8.67           35          131
##  7 July       45.2   43               6.93           32          100
##  8 August     44.7   43               6.73           34           94
##  9 September  43.2   42               5.15           34           85
## 10 October    44.8   44               5.95           32           96
## 11 November   46.1   45               5.72           36           72
## 12 December   48.2   47               6.21           37           77
# NOTE(review): the three lines below are disabled call arguments, presumably
# left over from an earlier table-formatting call -- confirm before removing.
      #row.names = FALSE,
      #digits = 2,
     # caption = "Summary Statistics: Air time (in minutes) for flights between NYC and DCA (2013)" )
# Print the monthly air-time summary table for flights to IAD.
print(iad_sum)
## # A tibble: 12 x 6
##    Month      Mean Median `Std. Deviation` `Min. Value` `Max. Value`
##    <chr>     <dbl>  <dbl>            <dbl>        <dbl>        <dbl>
##  1 January    49.7     49             4.8            40           68
##  2 February   49.2     49             4.09           41           74
##  3 March      48.0     48             4              39           69
##  4 April      48.2     48             4.54           39           75
##  5 May        45.6     45             5.91           37           93
##  6 June       46.9     46             6.07           38           85
##  7 July       46.8     45             6.95           36           88
##  8 August     47.3     46             7.4            38           91
##  9 September  46.2     46             5.49           38           78
## 10 October    48.6     48             5.41           39           97
## 11 November   48.4     48             4.22           38           64
## 12 December   49.6     49             5.66           40           83
# Plot data: arrival and departure delays of the June and December flights to
# DCA and IAD; rows with a missing value are dropped.
plotdata <- flights %>%
  select(month, Destination = dest, arr_delay, dep_delay) %>%
  filter(month %in% c(6, 12), Destination %in% c("DCA", "IAD")) %>%
  na.omit()

# format months: use a factor whose levels follow the calendar, so the facets
# appear as June, December instead of in alphabetical order. `month.name` is
# a base R constant, which keeps the labels independent of the session locale.
plotdata$month <- factor(month.name[plotdata$month], levels = month.name)

# plot: arrival delay against departure delay, one panel per month, points
# coloured by destination, with a smoothed trend line in black
flights_plot <-
  ggplot(plotdata, aes(x = dep_delay, y = arr_delay)) +
  geom_point(aes(colour = Destination)) +
  geom_smooth(colour = "black") +
  facet_wrap(~month) +
  theme_minimal() +
  ylab("Arrival Delay (in minutes)") +
  xlab("Departure Delay (in minutes)") +
  theme(legend.position = "top")

flights_plot
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

# Five-number summary plus mean for each column of the built-in cars data.
summary(datasets::cars)
##      speed           dist       
##  Min.   : 4.0   Min.   :  2.00  
##  1st Qu.:12.0   1st Qu.: 26.00  
##  Median :15.0   Median : 36.00  
##  Mean   :15.4   Mean   : 42.98  
##  3rd Qu.:19.0   3rd Qu.: 56.00  
##  Max.   :25.0   Max.   :120.00

Including Plots

You can also embed plots, for example:

Note that the `echo = FALSE` parameter was added to the code chunk to prevent printing of the R code that generated the plot.