mean_median_function <- function(x){
if (is.numeric(x) == FALSE) {
stop("Input is not numeric vector.")
}
m <- c(x[x>median(x)])
mean_median <- mean(m)
mean_median
}
To test the function, we can use the function on the following simple example:
x_test <-c(1,2,3,4,5)
mean_median_function(x_test)
## [1] 4.5
As we can see, the function returns the expected values of 4.5.
df_col <- function(df){
#define logical vectors of length ncol(df)
num <- vector(mode = "logical", length = ncol(df))
int <- vector(mode = "logical", length = ncol(df))
char <- vector(mode = "logical", length = ncol(df))
log <- vector(mode = "logical", length = ncol(df))
fac <- vector(mode = "logical", length = ncol(df))
#fill the vectors via sapply function
num <- sapply(df, is.numeric)
int <- sapply(df, is.integer)
char <- sapply(df, is.character)
log <- sapply(df, is.logical)
fac <- sapply(df, is.factor)
#use the sum function to count the number of "TRUE"
num_sum <- sum(num)
num_int <- sum(int)
num_char <- sum(char)
num_log <- sum(log)
num_fac <- sum(fac)
#output results
labels_out <- c("Numeric", "Integer", "Character", "Logistic", "Factor")
nums_out <- c(num_sum, num_int, num_char, num_log, num_fac)
output <- list(labels_out, nums_out)
output
}
To test, df_col we can use the previously explored data.frame “mtcars”, which we know is comprised of 11 numeric columns.
## mpg cyl disp hp drat wt qsec vs am gear carb
## Mazda RX4 21.0 6 160.0 110 3.90 2.620 16.46 0 1 4 4
## Mazda RX4 Wag 21.0 6 160.0 110 3.90 2.875 17.02 0 1 4 4
## Datsun 710 22.8 4 108.0 93 3.85 2.320 18.61 1 1 4 1
## Hornet 4 Drive 21.4 6 258.0 110 3.08 3.215 19.44 1 0 3 1
## Hornet Sportabout 18.7 8 360.0 175 3.15 3.440 17.02 0 0 3 2
## Valiant 18.1 6 225.0 105 2.76 3.460 20.22 1 0 3 1
## Duster 360 14.3 8 360.0 245 3.21 3.570 15.84 0 0 3 4
## Merc 240D 24.4 4 146.7 62 3.69 3.190 20.00 1 0 4 2
## Merc 230 22.8 4 140.8 95 3.92 3.150 22.90 1 0 4 2
## Merc 280 19.2 6 167.6 123 3.92 3.440 18.30 1 0 4 4
## Merc 280C 17.8 6 167.6 123 3.92 3.440 18.90 1 0 4 4
## Merc 450SE 16.4 8 275.8 180 3.07 4.070 17.40 0 0 3 3
## Merc 450SL 17.3 8 275.8 180 3.07 3.730 17.60 0 0 3 3
## Merc 450SLC 15.2 8 275.8 180 3.07 3.780 18.00 0 0 3 3
## Cadillac Fleetwood 10.4 8 472.0 205 2.93 5.250 17.98 0 0 3 4
## Lincoln Continental 10.4 8 460.0 215 3.00 5.424 17.82 0 0 3 4
## Chrysler Imperial 14.7 8 440.0 230 3.23 5.345 17.42 0 0 3 4
## Fiat 128 32.4 4 78.7 66 4.08 2.200 19.47 1 1 4 1
## Honda Civic 30.4 4 75.7 52 4.93 1.615 18.52 1 1 4 2
## Toyota Corolla 33.9 4 71.1 65 4.22 1.835 19.90 1 1 4 1
## Toyota Corona 21.5 4 120.1 97 3.70 2.465 20.01 1 0 3 1
## Dodge Challenger 15.5 8 318.0 150 2.76 3.520 16.87 0 0 3 2
## AMC Javelin 15.2 8 304.0 150 3.15 3.435 17.30 0 0 3 2
## Camaro Z28 13.3 8 350.0 245 3.73 3.840 15.41 0 0 3 4
## Pontiac Firebird 19.2 8 400.0 175 3.08 3.845 17.05 0 0 3 2
## Fiat X1-9 27.3 4 79.0 66 4.08 1.935 18.90 1 1 4 1
## Porsche 914-2 26.0 4 120.3 91 4.43 2.140 16.70 0 1 5 2
## Lotus Europa 30.4 4 95.1 113 3.77 1.513 16.90 1 1 5 2
## Ford Pantera L 15.8 8 351.0 264 4.22 3.170 14.50 0 1 5 4
## Ferrari Dino 19.7 6 145.0 175 3.62 2.770 15.50 0 1 5 6
## Maserati Bora 15.0 8 301.0 335 3.54 3.570 14.60 0 1 5 8
## Volvo 142E 21.4 4 121.0 109 4.11 2.780 18.60 1 1 4 2
df_col(mtcars)
## [[1]]
## [1] "Numeric" "Integer" "Character" "Logistic" "Factor"
##
## [[2]]
## [1] 11 0 0 0 0
Load data in the workspace by executing:
source("http://www3.nd.edu/~steve/computing_with_data_2014/9_Functions_matrices_lists/work_along_data_S9.R")
#Write a function that returns the sum of nonnegative numbers
pos_sum <- function(x){
sum(x[x > 0])
}
#With the defined function, we can drastically simplify nonnegative column sums
col_pos_sums <- apply(mat2, 2, pos_sum)
col_pos_sums
## [1] 0.2352207 0.1402782 3.0372189 0.3641867 1.2249126 3.5075542
Set the working directory:
setwd("~/Homeworks_SU2015/ontime_flight_data")
Now read in the 3 .csv files:
air_names <- read.csv(file = "air_carrier_names.csv")
air_codes <- read.csv(file = "airport_codes.csv")
on_time <- read.csv(file = "ONTIME1.csv")
attach(air_codes)
attach(on_time)
names(on_time)
## [1] "YEAR" "MONTH"
## [3] "DAY_OF_MONTH" "AIRLINE_ID"
## [5] "CARRIER" "ORIGIN_AIRPORT_ID"
## [7] "ORIGIN_AIRPORT_SEQ_ID" "ORIGIN_CITY_MARKET_ID"
## [9] "DEST_AIRPORT_ID" "DEST_AIRPORT_SEQ_ID"
## [11] "DEST_CITY_MARKET_ID" "ARR_TIME"
## [13] "ARR_DELAY" "CARRIER_DELAY"
## [15] "WEATHER_DELAY" "NAS_DELAY"
## [17] "SECURITY_DELAY" "LATE_AIRCRAFT_DELAY"
## [19] "X"
We observe that “CARRIER” is the 5th column of the data.frame.
num_NAs_1 <- apply(on_time, 2, is.na)[,5]
The sum function will return the number of “TRUE”s i.e. If the sum is greater than 0, then there are some flights with missing CARRIER information.
sum(num_NAs_1)
## [1] 0
Therefore, no flights are missing CARRIER variable information.
First we use the table function to obtain a listing of all of the carrier codes used in the on_time data.frame.
table(on_time$CARRIER)
##
## 9E AA AS B6 DL EV F9 FL HA MQ OO UA
## 23658 44848 12020 19291 57182 57934 5804 15313 5982 37693 51762 40419
## US VX WN YV
## 33567 4230 89169 10647
Now, we need to check if any of these codes do not appear in the data.frame air_names
codes_an <- air_names[air_names$Code %in% on_time$CARRIER, ]
codes_an
## Code Description
## 136 9E Pinnacle Airlines Inc. (2002 - 2013)
## 137 9E Endeavor Air Inc. (2013 - )
## 156 AA American Airlines Inc. (1960 - )
## 291 AS Alaska Airlines Inc. (1960 - )
## 338 B6 JetBlue Airways (2000 - )
## 550 DL Delta Air Lines Inc. (1960 - )
## 623 EV ExpressJet Airlines Inc. (2012 - )
## 624 EV Atlantic Southeast Airlines (1993 - 2011)
## 639 F9 Frontier Airlines Inc. (1994 - )
## 663 FL Frontier Airlines Inc. (1960 - 1986)
## 664 FL AirTran Airways Corporation (1994 - )
## 754 HA Hawaiian Airlines Inc. (1960 - )
## 1020 MQ Simmons Airlines (1991 - 1998)
## 1021 MQ American Eagle Airlines Inc. (1998 - )
## 1137 OO SkyWest Airlines Inc. (2003 - )
## 1550 UA United Air Lines Inc. (1960 - )
## 1561 US US Airways Inc. (1997 - )
## 1562 US USAir (1988 - 1997)
## 1615 VX Virgin America (2007 - )
## 1616 VX Aces Airlines (1992 - 2003)
## 1659 WN Southwest Airlines Co. (1979 - )
## 1760 YV Mesa Airlines Inc. (1995 - )
As we can see there are no codes used are not found in the carrier look-up table
First we create a subset of all of the flights in the database that occurred during January 2013
subset_2013 <- subset(on_time, on_time$YEAR == 2013)
subset_jan <- subset(on_time, on_time$MONTH == 1)
We observe that the subsets for both MONTH and YEAR are equivalent to the original data.frame of on_time. Thus, we can conclude that rows in the data.frame represent flights from January 2013.
Thus, we use the unique and length functions to obtain the number of airlines
length(unique(on_time$CARRIER))
## [1] 16
df_codes_ontime <- merge(x = on_time, y = air_codes, by.x = "ORIGIN_AIRPORT_ID", by.y = "Code")
Now change the column name of the descriptions to something more descriptive
colnames(df_codes_ontime)[20] <- "ORIGIN_AIRPORT_DESCRIPTION"
str(df_codes_ontime)
## 'data.frame': 509519 obs. of 20 variables:
## $ ORIGIN_AIRPORT_ID : int 10135 10135 10135 10135 10135 10135 10135 10135 10135 10135 ...
## $ YEAR : int 2013 2013 2013 2013 2013 2013 2013 2013 2013 2013 ...
## $ MONTH : int 1 1 1 1 1 1 1 1 1 1 ...
## $ DAY_OF_MONTH : int 15 22 20 22 21 28 17 11 28 10 ...
## $ AIRLINE_ID : int 20366 20378 20366 20366 20366 20366 20366 20366 20366 20366 ...
## $ CARRIER : Factor w/ 16 levels "9E","AA","AS",..: 6 16 6 6 6 6 6 6 6 6 ...
## $ ORIGIN_AIRPORT_SEQ_ID : int 1013503 1013503 1013503 1013503 1013503 1013503 1013503 1013503 1013503 1013503 ...
## $ ORIGIN_CITY_MARKET_ID : int 30135 30135 30135 30135 30135 30135 30135 30135 30135 30135 ...
## $ DEST_AIRPORT_ID : int 13930 11057 10397 13930 13930 13930 13930 13930 13930 13930 ...
## $ DEST_AIRPORT_SEQ_ID : int 1393002 1105703 1039705 1393002 1393002 1393002 1393002 1393002 1393002 1393002 ...
## $ DEST_CITY_MARKET_ID : int 30977 31057 30397 30977 30977 30977 30977 30977 30977 30977 ...
## $ ARR_TIME : int 1257 1148 840 1321 1318 1937 2026 1240 1253 1226 ...
## $ ARR_DELAY : num 7 72 -17 31 28 12 61 -10 3 -24 ...
## $ CARRIER_DELAY : num NA 72 NA 0 6 NA 2 NA NA NA ...
## $ WEATHER_DELAY : num NA 0 NA 0 0 NA 0 NA NA NA ...
## $ NAS_DELAY : num NA 0 NA 31 7 NA 0 NA NA NA ...
## $ SECURITY_DELAY : num NA 0 NA 0 0 NA 0 NA NA NA ...
## $ LATE_AIRCRAFT_DELAY : num NA 0 NA 0 15 NA 59 NA NA NA ...
## $ X : logi NA NA NA NA NA NA ...
## $ ORIGIN_AIRPORT_DESCRIPTION: Factor w/ 6232 levels "47-Mile Mine, AK: 47-Mile Mine Airport",..: 134 134 134 134 134 134 134 134 134 134 ...
The United Airlines code is “UA”. Thus we create a sub data.frame for the United Airlines flights:
UA_sub <- subset(on_time, on_time$CARRIER == 'UA')
head(UA_sub)
## YEAR MONTH DAY_OF_MONTH AIRLINE_ID CARRIER ORIGIN_AIRPORT_ID
## 325291 2013 1 31 19977 UA 13930
## 325292 2013 1 31 19977 UA 12266
## 325293 2013 1 31 19977 UA 12889
## 325294 2013 1 31 19977 UA 11618
## 325295 2013 1 31 19977 UA 13930
## 325296 2013 1 31 19977 UA 14683
## ORIGIN_AIRPORT_SEQ_ID ORIGIN_CITY_MARKET_ID DEST_AIRPORT_ID
## 325291 1393002 30977 11278
## 325292 1226603 31453 14635
## 325293 1288903 32211 12266
## 325294 1161802 31703 11292
## 325295 1393002 30977 11278
## 325296 1468303 33214 11618
## DEST_AIRPORT_SEQ_ID DEST_CITY_MARKET_ID ARR_TIME ARR_DELAY
## 325291 1127802 30852 1232 -24
## 325292 1463502 31714 1627 -19
## 325293 1226603 31453 1053 -7
## 325294 1129202 30325 2113 28
## 325295 1127802 30852 1602 2
## 325296 1161802 31703 1907 137
## CARRIER_DELAY WEATHER_DELAY NAS_DELAY SECURITY_DELAY
## 325291 NA NA NA NA
## 325292 NA NA NA NA
## 325293 NA NA NA NA
## 325294 0 0 28 0
## 325295 NA NA NA NA
## 325296 0 0 137 0
## LATE_AIRCRAFT_DELAY X
## 325291 NA NA
## 325292 NA NA
## 325293 NA NA
## 325294 0 NA
## 325295 NA NA
## 325296 0 NA
length(UA_sub)
## [1] 19
Early departures are negative numbers in the ARR_DELAY in on_time data.frame. If ARR_DELAY > 0, there was a delay.
Fill column according to whether the ARR_DELAY is positive (and thus delayed) or negative (and thus early)
neg_logical <- function(x){
ifelse(x > 0, T, F)
}
UA_sub$ANY_DELAY <- sapply(UA_sub$ARR_DELAY, neg_logical)
head(UA_sub)[13]
## ARR_DELAY
## 325291 -24
## 325292 -19
## 325293 -7
## 325294 28
## 325295 2
## 325296 137
head(UA_sub)[20]
## ANY_DELAY
## 325291 FALSE
## 325292 FALSE
## 325293 FALSE
## 325294 TRUE
## 325295 TRUE
## 325296 TRUE
We notice that the negative ARR_DELAY values correspond to FALSE, meaning the flight experienced no delays, which is expected since the negative values indicate early departures.
delay_sum <- sapply(UA_sub$ANY_DELAY, sum)
percentage <- (sum(delay_sum, na.rm=TRUE)/length(UA_sub$ANY_DELAY))*100
percentage
## [1] 33.68465
Using the data.frame from 8
sorted_UA_sub <- UA_sub[order(-UA_sub$ARR_DELAY), ]
Alternativeltly, using the data.frame from 7 with United Airlines only subset
UA_df_sub <- subset(df_codes_ontime, df_codes_ontime$CARRIER == 'UA')
sorted_df_sub <- UA_df_sub[order(-UA_df_sub$ARR_DELAY),]
Using the data.frame from 8
head(sorted_UA_sub, n = 3)[6]
## ORIGIN_AIRPORT_ID
## 335156 13830
## 347081 13930
## 361653 13930
Using the modified data.frame from 7, we can get the names of the airports instead of the airline codes, which we would need to look up in the table.
names(sorted_df_sub)
## [1] "ORIGIN_AIRPORT_ID" "YEAR"
## [3] "MONTH" "DAY_OF_MONTH"
## [5] "AIRLINE_ID" "CARRIER"
## [7] "ORIGIN_AIRPORT_SEQ_ID" "ORIGIN_CITY_MARKET_ID"
## [9] "DEST_AIRPORT_ID" "DEST_AIRPORT_SEQ_ID"
## [11] "DEST_CITY_MARKET_ID" "ARR_TIME"
## [13] "ARR_DELAY" "CARRIER_DELAY"
## [15] "WEATHER_DELAY" "NAS_DELAY"
## [17] "SECURITY_DELAY" "LATE_AIRCRAFT_DELAY"
## [19] "X" "ORIGIN_AIRPORT_DESCRIPTION"
head(sorted_df_sub, n = 3)[20]
## ORIGIN_AIRPORT_DESCRIPTION
## 354267 Kahului, HI: Kahului Airport
## 374129 Chicago, IL: Chicago O'Hare International
## 381311 Chicago, IL: Chicago O'Hare International