#install.packages("nycflights13")
library(nycflights13)
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5 v purrr 0.3.4
## v tibble 3.1.6 v dplyr 1.0.8
## v tidyr 1.2.0 v stringr 1.4.0
## v readr 2.1.2 v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(psych)
##
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
# Open the flights table in the data viewer, then print per-column
# descriptive statistics (n, mean, sd, median, ...) from psych.
tibble::view(flights)
psych::describe(flights)
## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning -Inf
## vars n mean sd median trimmed mad min max
## year 1 336776 2013.00 0.00 2013 2013.00 0.00 2013 2013
## month 2 336776 6.55 3.41 7 6.56 4.45 1 12
## day 3 336776 15.71 8.77 16 15.70 11.86 1 31
## dep_time 4 328521 1349.11 488.28 1401 1346.82 634.55 1 2400
## sched_dep_time 5 336776 1344.25 467.34 1359 1341.60 613.80 106 2359
## dep_delay 6 328521 12.64 40.21 -2 3.32 5.93 -43 1301
## arr_time 7 328063 1502.05 533.26 1535 1526.42 619.73 1 2400
## sched_arr_time 8 336776 1536.38 497.46 1556 1550.67 618.24 1 2359
## arr_delay 9 327346 6.90 44.63 -5 -1.03 20.76 -86 1272
## carrier* 10 336776 7.14 4.14 6 7.00 5.93 1 16
## flight 11 336776 1971.92 1632.47 1496 1830.51 1608.62 1 8500
## tailnum* 12 334264 1814.32 1199.75 1798 1778.21 1587.86 1 4043
## origin* 13 336776 1.95 0.82 2 1.94 1.48 1 3
## dest* 14 336776 50.03 28.12 50 49.56 32.62 1 105
## air_time 15 327346 150.69 93.69 129 140.03 75.61 20 695
## distance 16 336776 1039.91 733.23 872 955.27 569.32 17 4983
## hour 17 336776 13.18 4.66 13 13.15 5.93 1 23
## minute 18 336776 26.23 19.30 29 25.64 23.72 0 59
## time_hour 19 336776 NaN NA NA NaN NA Inf -Inf
## range skew kurtosis se
## year 0 NaN NaN 0.00
## month 11 -0.01 -1.19 0.01
## day 30 0.01 -1.19 0.02
## dep_time 2399 -0.02 -1.09 0.85
## sched_dep_time 2253 -0.01 -1.20 0.81
## dep_delay 1344 4.80 43.95 0.07
## arr_time 2399 -0.47 -0.19 0.93
## sched_arr_time 2358 -0.35 -0.38 0.86
## arr_delay 1358 3.72 29.23 0.08
## carrier* 15 0.36 -1.21 0.01
## flight 8499 0.66 -0.85 2.81
## tailnum* 4042 0.17 -1.24 2.08
## origin* 2 0.09 -1.50 0.00
## dest* 104 0.13 -1.08 0.05
## air_time 675 1.07 0.86 0.16
## distance 4966 1.13 1.19 1.26
## hour 22 0.00 -1.21 0.01
## minute 59 0.09 -1.24 0.03
## time_hour -Inf NA NA NA
# Copy the package dataset into the workspace under the same name.
# Qualifying the source with its namespace makes it obvious that this is not
# a no-op self-assignment and guarantees the nycflights13 data is used,
# whatever else happens to be on the search path.
flights <- nycflights13::flights
# Compact overview of the column names, types and first values.
str(flights)
## tibble [336,776 x 19] (S3: tbl_df/tbl/data.frame)
## $ year : int [1:336776] 2013 2013 2013 2013 2013 2013 2013 2013 2013 2013 ...
## $ month : int [1:336776] 1 1 1 1 1 1 1 1 1 1 ...
## $ day : int [1:336776] 1 1 1 1 1 1 1 1 1 1 ...
## $ dep_time : int [1:336776] 517 533 542 544 554 554 555 557 557 558 ...
## $ sched_dep_time: int [1:336776] 515 529 540 545 600 558 600 600 600 600 ...
## $ dep_delay : num [1:336776] 2 4 2 -1 -6 -4 -5 -3 -3 -2 ...
## $ arr_time : int [1:336776] 830 850 923 1004 812 740 913 709 838 753 ...
## $ sched_arr_time: int [1:336776] 819 830 850 1022 837 728 854 723 846 745 ...
## $ arr_delay : num [1:336776] 11 20 33 -18 -25 12 19 -14 -8 8 ...
## $ carrier : chr [1:336776] "UA" "UA" "AA" "B6" ...
## $ flight : int [1:336776] 1545 1714 1141 725 461 1696 507 5708 79 301 ...
## $ tailnum : chr [1:336776] "N14228" "N24211" "N619AA" "N804JB" ...
## $ origin : chr [1:336776] "EWR" "LGA" "JFK" "JFK" ...
## $ dest : chr [1:336776] "IAH" "IAH" "MIA" "BQN" ...
## $ air_time : num [1:336776] 227 227 160 183 116 150 158 53 140 138 ...
## $ distance : num [1:336776] 1400 1416 1089 1576 762 ...
## $ hour : num [1:336776] 5 5 5 5 6 5 6 6 6 6 ...
## $ minute : num [1:336776] 15 29 40 45 0 58 0 0 0 0 ...
## $ time_hour : POSIXct[1:336776], format: "2013-01-01 05:00:00" "2013-01-01 05:00:00" ...
# distinct origin airports, in order of first appearance
flights %>% pull(origin) %>% unique()
## [1] "EWR" "LGA" "JFK"
# distinct destination airports, in order of first appearance
flights %>% pull(dest) %>% unique()
## [1] "IAH" "MIA" "BQN" "ATL" "ORD" "FLL" "IAD" "MCO" "PBI" "TPA" "LAX" "SFO"
## [13] "DFW" "BOS" "LAS" "MSP" "DTW" "RSW" "SJU" "PHX" "BWI" "CLT" "BUF" "DEN"
## [25] "SNA" "MSY" "SLC" "XNA" "MKE" "SEA" "ROC" "SYR" "SRQ" "RDU" "CMH" "JAX"
## [37] "CHS" "MEM" "PIT" "SAN" "DCA" "CLE" "STL" "MYR" "JAC" "MDW" "HNL" "BNA"
## [49] "AUS" "BTV" "PHL" "STT" "EGE" "AVL" "PWM" "IND" "SAV" "CAK" "HOU" "LGB"
## [61] "DAY" "ALB" "BDL" "MHT" "MSN" "GSO" "CVG" "BUR" "RIC" "GSP" "GRR" "MCI"
## [73] "ORF" "SAT" "SDF" "PDX" "SJC" "OMA" "CRW" "OAK" "SMF" "TUL" "TYS" "OKC"
## [85] "PVD" "DSM" "PSE" "BHM" "CAE" "HDN" "BZN" "MTJ" "EYW" "PSP" "ACK" "BGR"
## [97] "ABQ" "ILM" "MVY" "SBN" "LEX" "CHO" "TVC" "ANC" "LGA"
# Summarise air time per group for one destination.
#
# `flights_by_month` is a flights table that has already been grouped (here:
# by `month`) and contains an `air_time` column. The result has one row per
# group with the mean, median, standard deviation, minimum and maximum of
# `air_time` (missing values ignored), each rounded to two digits.
summarise_air_time <- function(flights_by_month) {
  flights_by_month %>%
    summarise(
      mean_air_time = mean(air_time, na.rm = TRUE),
      median_air_time = median(air_time, na.rm = TRUE),
      sd_air_time = sd(air_time, na.rm = TRUE),
      min_air_time = min(air_time, na.rm = TRUE),
      max_air_time = max(air_time, na.rm = TRUE)
    ) %>%
    # round only the statistics, never the grouping key
    mutate(across(ends_with("_air_time"), ~ round(.x, digits = 2)))
}

# select flights to DCA and summarize air time by month
DCA_flights <- filter(flights, dest == "DCA")
dca_by_months <- group_by(DCA_flights, month)
dca_sum <- summarise_air_time(dca_by_months)

# select flights to IAD and summarize air time by month
IAD_flights <- filter(flights, dest == "IAD")
iad_by_months <- group_by(IAD_flights, month)
iad_sum <- summarise_air_time(iad_by_months)
# load the package
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
# Replace the month numbers (1-12) with English month names. Indexing the
# built-in `month.name` constant avoids a round trip through date parsing
# and, unlike months(), does not depend on the session locale, so the labels
# always match the English column headers below.
dca_sum$month <- month.name[dca_sum$month]
iad_sum$month <- month.name[iad_sum$month]

# change column headers; both tables share the same layout
summary_headers <- c(
  "Month", "Mean", "Median", "Std. Deviation", "Min. Value", "Max. Value"
)
names(dca_sum) <- summary_headers
names(iad_sum) <- summary_headers
The `kable()` function is a table generator for R Markdown. It can be further configured to produce output in LaTeX, HTML, etc. See `?kable` for more information.
# print the monthly air-time summary table for flights to DCA
print(dca_sum)
## # A tibble: 12 x 6
## Month Mean Median `Std. Deviation` `Min. Value` `Max. Value`
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 January 47.8 47 6.14 36 91
## 2 February 46.9 46 5.31 37 70
## 3 March 45.5 45 5.43 32 83
## 4 April 46.7 45.5 6.54 36 105
## 5 May 45.0 44 6.5 34 92
## 6 June 46.7 45 8.67 35 131
## 7 July 45.2 43 6.93 32 100
## 8 August 44.7 43 6.73 34 94
## 9 September 43.2 42 5.15 34 85
## 10 October 44.8 44 5.95 32 96
## 11 November 46.1 45 5.72 36 72
## 12 December 48.2 47 6.21 37 77
#row.names = FALSE,
#digits = 2,
# caption = "Summary Statistics: Air time (in minutes) for flights between NYC and DCA (2013)" )
# print the monthly air-time summary table for flights to IAD
print(iad_sum)
## # A tibble: 12 x 6
## Month Mean Median `Std. Deviation` `Min. Value` `Max. Value`
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 January 49.7 49 4.8 40 68
## 2 February 49.2 49 4.09 41 74
## 3 March 48.0 48 4 39 69
## 4 April 48.2 48 4.54 39 75
## 5 May 45.6 45 5.91 37 93
## 6 June 46.9 46 6.07 38 85
## 7 July 46.8 45 6.95 36 88
## 8 August 47.3 46 7.4 38 91
## 9 September 46.2 46 5.49 38 78
## 10 October 48.6 48 5.41 39 97
## 11 November 48.4 48 4.22 38 64
## 12 December 49.6 49 5.66 40 83
# select subset: June and December flights to DCA or IAD, keeping only rows
# where every selected column is present
plotdata <- flights %>%
  select(month, Destination = dest, arr_delay, dep_delay) %>%
  filter(month %in% c(6, 12), Destination %in% c("DCA", "IAD")) %>%
  drop_na()
# format months as English names; storing them as a factor with calendar-
# ordered levels makes the June panel come before December (plain strings
# would be sorted alphabetically, putting December first)
plotdata$month <- factor(month.name[plotdata$month], levels = month.name)
# plot arrival delay against departure delay, one panel per month,
# points coloured by destination with a single black smoother per panel
flights_plot <-
  ggplot(plotdata, aes(x = dep_delay, y = arr_delay)) +
  geom_point(aes(colour = Destination)) +
  geom_smooth(colour = "black") +
  facet_wrap(~month) +
  theme_minimal() +
  labs(
    x = "Departure Delay (in minutes)",
    y = "Arrival Delay (in minutes)"
  ) +
  theme(legend.position = "top")
flights_plot
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
When you click the **Knit** button, a document will be generated that includes both content and the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
# template example: per-column summary (min, quartiles, median, mean, max)
# of the cars dataset
summary(cars)
## speed dist
## Min. : 4.0 Min. : 2.00
## 1st Qu.:12.0 1st Qu.: 26.00
## Median :15.0 Median : 36.00
## Mean :15.4 Mean : 42.98
## 3rd Qu.:19.0 3rd Qu.: 56.00
## Max. :25.0 Max. :120.00
You can also embed plots, for example:
Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.