############ Independent Study_Yanxin Li ################
######## Work Along - Vectors ########
source("http://www3.nd.edu/~steve/computing_with_data_2014/3_Vectors/work_along_data_S3.R")
# What is the length of x? 47
length(x)
## [1] 47
# What is the class of x? numeric
class(x)
## [1] "numeric"
# What is the name of the 4th entry in x? N4
# (take the names vector first, then pick its 4th element)
names(x)[4]
## [1] "N4"
# What are the mean, median and 3rd quartile of x?
# mean -0.04026, median 0.25190, 3rd quartile 1.38500
summary(x)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -4.15900 -1.24300 0.25190 -0.04026 1.38500 3.08500
# What are the possible values for an entry of y? 0, 1
y1 <- as.numeric(y)
y1
## [1] 1 0 1 1 0 1 1 0 1 1 0 1 1 0 1 1 0 1
# How many of each value are there?
sum(y) # number of 1 (12)
## [1] 12
sum(!y) # number of 0 (6): negation turns every 0 entry into TRUE
## [1] 6
# How many entries in x are less than 0? 21
sum(x < 0)
## [1] 21
# Form a new vector of all entries in x greater than the median
# which() keeps only the positions where the comparison is TRUE.
x.sub <- x[which(x > median(x))]
x.sub
## N1 N2 N7 N8 N10 N13 N14
## 0.9886213 1.0515351 1.4071772 0.3987219 1.7079851 2.3304798 1.8955924
## N17 N19 N20 N21 N22 N23 N26
## 0.7890702 2.8608310 2.1563374 1.3703668 0.6488079 1.3986529 1.1803058
## N28 N29 N33 N35 N39 N41 N43
## 0.4512312 1.0588708 1.7227900 0.6428734 1.7454097 3.0852416 0.3280050
## N46 N47
## 1.5686945 3.0681204
# What is the sum of v1? 28
sum(v1)
## [1] 28
# What is the sum of v2? 17
# Drop the missing entries before summing.
sum(v2[!is.na(v2)])
## [1] 17
######## Work Along - Matrices ########
source("http://www3.nd.edu/~steve/computing_with_data_2014/4_Matrices/work_along_data_S4.R")
# How many rows and columns are in M1? row:2, column:6
nrow(M1)
## [1] 2
ncol(M1)
## [1] 6
# Rearrange the underlying entries of M1 as a 4 x 3 matrix
# Reshape M1's own entries (matrix() fills column by column) instead of
# retyping the values, so the result follows whatever M1 actually holds.
M1.rea <- matrix(M1, nrow = 4, ncol = 3)
# Replace the NA values in M2 by 0's
M2
## [,1] [,2] [,3] [,4] [,5]
## [1,] 1 NA 9 13 17
## [2,] 2 6 10 NA 18
## [3,] 3 7 11 15 NA
## [4,] 4 8 12 16 20
# Locate the missing cells with is.na() rather than hard-coding their
# positions; every NA is replaced wherever it occurs.
M2[is.na(M2)] <- 0
# Form a matrix that binds the columns of M1 and the matrix z
# Check that the dimensions match first
M3 <- cbind(M1, z)
M3
## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8]
## [1,] "1" "3" "5" "7" "9" "11" "a" "a"
## [2,] "2" "4" "6" "8" "10" "12" "a" "a"
# Binding by rows needs matching column counts, hence the transpose of M1.
M4 <- rbind(t(M1), z)
M4
## [,1] [,2]
## [1,] "1" "2"
## [2,] "3" "4"
## [3,] "5" "6"
## [4,] "7" "8"
## [5,] "9" "10"
## [6,] "11" "12"
## [7,] "a" "a"
## [8,] "a" "a"
dim(M1)
## [1] 2 6
dim(z)
## [1] 2 2
nrow(z)
## [1] 2
ncol(z)
## [1] 2
# What happened to the numbers?
# They are printed in quotes in M3 and M4: a matrix holds a single type, so
# the numbers were converted to character when combined with z.
dim(M3)
## [1] 2 8
dim(M4)
## [1] 8 2
######## Work Along - Data Frames ########
# Display the variables and variable classes for the data.frame mtcars
# Auto-print the whole data frame; the header row of the printout lists the
# variable (column) names.
mtcars
## mpg cyl disp hp drat wt qsec vs am gear carb
## Mazda RX4 21.0 6 160.0 110 3.90 2.620 16.46 0 1 4 4
## Mazda RX4 Wag 21.0 6 160.0 110 3.90 2.875 17.02 0 1 4 4
## Datsun 710 22.8 4 108.0 93 3.85 2.320 18.61 1 1 4 1
## Hornet 4 Drive 21.4 6 258.0 110 3.08 3.215 19.44 1 0 3 1
## Hornet Sportabout 18.7 8 360.0 175 3.15 3.440 17.02 0 0 3 2
## Valiant 18.1 6 225.0 105 2.76 3.460 20.22 1 0 3 1
## Duster 360 14.3 8 360.0 245 3.21 3.570 15.84 0 0 3 4
## Merc 240D 24.4 4 146.7 62 3.69 3.190 20.00 1 0 4 2
## Merc 230 22.8 4 140.8 95 3.92 3.150 22.90 1 0 4 2
## Merc 280 19.2 6 167.6 123 3.92 3.440 18.30 1 0 4 4
## Merc 280C 17.8 6 167.6 123 3.92 3.440 18.90 1 0 4 4
## Merc 450SE 16.4 8 275.8 180 3.07 4.070 17.40 0 0 3 3
## Merc 450SL 17.3 8 275.8 180 3.07 3.730 17.60 0 0 3 3
## Merc 450SLC 15.2 8 275.8 180 3.07 3.780 18.00 0 0 3 3
## Cadillac Fleetwood 10.4 8 472.0 205 2.93 5.250 17.98 0 0 3 4
## Lincoln Continental 10.4 8 460.0 215 3.00 5.424 17.82 0 0 3 4
## Chrysler Imperial 14.7 8 440.0 230 3.23 5.345 17.42 0 0 3 4
## Fiat 128 32.4 4 78.7 66 4.08 2.200 19.47 1 1 4 1
## Honda Civic 30.4 4 75.7 52 4.93 1.615 18.52 1 1 4 2
## Toyota Corolla 33.9 4 71.1 65 4.22 1.835 19.90 1 1 4 1
## Toyota Corona 21.5 4 120.1 97 3.70 2.465 20.01 1 0 3 1
## Dodge Challenger 15.5 8 318.0 150 2.76 3.520 16.87 0 0 3 2
## AMC Javelin 15.2 8 304.0 150 3.15 3.435 17.30 0 0 3 2
## Camaro Z28 13.3 8 350.0 245 3.73 3.840 15.41 0 0 3 4
## Pontiac Firebird 19.2 8 400.0 175 3.08 3.845 17.05 0 0 3 2
## Fiat X1-9 27.3 4 79.0 66 4.08 1.935 18.90 1 1 4 1
## Porsche 914-2 26.0 4 120.3 91 4.43 2.140 16.70 0 1 5 2
## Lotus Europa 30.4 4 95.1 113 3.77 1.513 16.90 1 1 5 2
## Ford Pantera L 15.8 8 351.0 264 4.22 3.170 14.50 0 1 5 4
## Ferrari Dino 19.7 6 145.0 175 3.62 2.770 15.50 0 1 5 6
## Maserati Bora 15.0 8 301.0 335 3.54 3.570 14.60 0 1 5 8
## Volvo 142E 21.4 4 121.0 109 4.11 2.780 18.60 1 1 4 2
# Class of the whole object, of a slice of one column, and of a
# multi-column slice.
class(mtcars)
## [1] "data.frame"
class(mtcars[3:7, "hp"])
## [1] "numeric"
class(mtcars[4:8, 3:10])
## [1] "data.frame"
# First six rows.
head(mtcars, n = 6)
## mpg cyl disp hp drat wt qsec vs am gear carb
## Mazda RX4 21.0 6 160 110 3.90 2.620 16.46 0 1 4 4
## Mazda RX4 Wag 21.0 6 160 110 3.90 2.875 17.02 0 1 4 4
## Datsun 710 22.8 4 108 93 3.85 2.320 18.61 1 1 4 1
## Hornet 4 Drive 21.4 6 258 110 3.08 3.215 19.44 1 0 3 1
## Hornet Sportabout 18.7 8 360 175 3.15 3.440 17.02 0 0 3 2
## Valiant 18.1 6 225 105 2.76 3.460 20.22 1 0 3 1
# Number of rows, then number of columns.
c(nrow(mtcars), ncol(mtcars))
## [1] 32 11
# How many cars are there for each possible choice of number of cylinders.
sum(mtcars[["cyl"]] == 4) # 11
## [1] 11
sum(mtcars[["cyl"]] == 6) # 7
## [1] 7
sum(mtcars[["cyl"]] == 8) # 14
## [1] 14
# Create a new data.frame that adds a column for the new computed variable mpg/cyl
# Miles per gallon divided by the number of cylinders, one value per car.
mpg.cyl <- with(mtcars, mpg / cyl)
# Append the computed values as a twelfth column named "mpg.cyl".
mtcars.new <- cbind(mtcars, mpg.cyl = mpg.cyl)
head(mtcars.new)
## mpg cyl disp hp drat wt qsec vs am gear carb
## Mazda RX4 21.0 6 160 110 3.90 2.620 16.46 0 1 4 4
## Mazda RX4 Wag 21.0 6 160 110 3.90 2.875 17.02 0 1 4 4
## Datsun 710 22.8 4 108 93 3.85 2.320 18.61 1 1 4 1
## Hornet 4 Drive 21.4 6 258 110 3.08 3.215 19.44 1 0 3 1
## Hornet Sportabout 18.7 8 360 175 3.15 3.440 17.02 0 0 3 2
## Valiant 18.1 6 225 105 2.76 3.460 20.22 1 0 3 1
## mpg.cyl
## Mazda RX4 3.500000
## Mazda RX4 Wag 3.500000
## Datsun 710 5.700000
## Hornet 4 Drive 3.566667
## Hornet Sportabout 2.337500
## Valiant 3.016667
# What is the mean horsepower (hp) for the cars with 4 gears?
# Keep only the rows whose gear count is 4, then average their hp column.
data <- subset(mtcars, gear == 4)
mean(data[["hp"]])
## [1] 89.5
######## Work along - Loops, etc ########
source("http://www3.nd.edu/~steve/computing_with_data_2014/7_Loops_etc/work_along_data_S7.R")
# Compute a sequence r of length 100 such that r[1] = 0 and r[i+1] is randomly sampled from a normal distribution with mean r[i] and standard deviation 1.
# Inspect the first 5 entries and the last 5 entries. Any patterns?
# A draw from N(r[i], 1) is r[i] plus a standard normal increment, so the
# sequence is 0 followed by the running sum of 99 N(0, 1) draws. Building it
# with cumsum() avoids growing `r` one element at a time inside a loop.
r <- c(0, cumsum(rnorm(99)))
r[1:5]
tail(r, n = 5)
class(r[1:7])
## [1] 0.0000000 -0.4293217 1.1315557 1.9354511 1.8365423
## [1] 0.5505827 0.9085822 1.2888064 0.6252272 1.0861533
## [1] "numeric"
# prob_L is list of character vectors. Use a for loop to define a vector p1 such that p1[j] is the first entry of the \(j^{th}\) component of prob_L.
# Auto-print the list first to see its components and their lengths.
prob_L
## [[1]]
## [1] "word" "word2"
##
## [[2]]
## [1] "A" "2"
##
## [[3]]
## [1] "8"
##
## [[4]]
## [1] FALSE
##
## [[5]]
## [1] "yes" "no" "maybe-so"
# First entry of the first component, as a spot check.
prob_L[[1]][1]
## [1] "word"
# Collect the first entry of every component. The result is allocated up
# front and the loop runs over however many components prob_L has, instead of
# assuming exactly five. as.character() makes the conversion of non-character
# entries (such as the logical FALSE) explicit.
p1 <- character(length(prob_L))
for (i in seq_along(prob_L)) {
  p1[i] <- as.character(prob_L[[i]][1])
}
p1
## [1] "word" "A" "8" "FALSE" "yes"
# Create a vector where the \(j-\)entry is the second entry of the \(j^{th}\) component of prob_L.
# Indexing past the end of a component yields NA, so components with fewer
# than two entries contribute NA here.
p2 <- character(length(prob_L))
for (i in seq_along(prob_L)) {
  p2[i] <- as.character(prob_L[[i]][2])
}
# Drop the components that have no second entry by testing for NA, rather
# than listing the positions that happen to have one.
p2 <- p2[!is.na(p2)]
p2
## [1] "word2" "2" "no"
######## Work along - Defining functions ########
# Write a function of one variable that for an input numeric vector, computes the mean of the samples above the median.
# Check that the input is a numeric vector and if it isn't,
# return an appropriate message about the wrong function argument.

#' Mean of the values strictly above the median
#'
#' @param v A numeric vector.
#' @return The mean of the entries of `v` that are greater than `median(v)`.
#'   The result is `NaN` when no entry lies above the median (for example an
#'   empty or constant vector) and `NA` when `v` contains missing values.
#' @details Stops with an error message when `v` is not numeric.
mean_above_median <- function(v) {
  if (!is.numeric(v)) {
    stop(
      "`v` must be a numeric vector, not an object of class '",
      class(v)[1], "'.",
      call. = FALSE
    )
  }
  # Average only the selected values; padding the rest with zeros and
  # averaging over the full length would roughly halve the result.
  mean(v[v > median(v)])
}

u <- runif(500)
median(u)
## [1] 0.4883122
mean_u <- mean_above_median(u)
mean_u
## (varies with the random draw; close to 0.75 for uniform(0, 1) samples)
class(u)
## [1] "numeric"
class(mean_u)
## [1] "numeric"
# Write a function that given a data.frame, reports the numbers of columns of each possible class.
# The possible classes are numeric, integer, character, logical, factor
# Sample data: read the table from the working directory, label its nine
# columns and store the city names as plain character strings.
setwd('D:\\R Code')
Precip <- read.table("Precip.txt")
names(Precip) <- c("Coast","City","State","April","May","Se","Xe","Xw","Logic")
Precip[["City"]] <- as.character(Precip[["City"]])
Precip
## Coast City State April May Se Xe Xw Logic
## 1 East Albany NY 2.9 3.3 1 2.9 0.0 TRUE
## 2 East Washington DC 3.1 3.6 1 3.1 0.0 TRUE
## 3 East Jacksonville FL 3.3 4.9 1 3.3 0.0 TRUE
## 4 East Raleigh NC 2.9 3.7 1 2.9 0.0 TRUE
## 5 East Burlington VT 2.8 3.0 1 2.8 0.0 TRUE
## 6 West Los_Angeles CA 1.2 0.2 0 0.0 1.2 FALSE
## 7 West Seattle WA 2.4 1.6 0 0.0 2.4 FALSE
## 8 West Portland OR 2.3 2.1 0 0.0 2.3 FALSE
## 9 West San_Diego CA 2.6 1.5 0 0.0 2.6 FALSE
## 10 West Fresno CA 1.2 0.3 0 0.0 1.2 FALSE
# Structure of the data frame: one line per column with its class.
str(Precip)
## 'data.frame': 10 obs. of 9 variables:
## $ Coast: Factor w/ 2 levels "East","West": 1 1 1 1 1 2 2 2 2 2
## $ City : chr "Albany" "Washington" "Jacksonville" "Raleigh" ...
## $ State: Factor w/ 8 levels "CA","DC","FL",..: 5 2 3 4 7 1 8 6 1 1
## $ April: num 2.9 3.1 3.3 2.9 2.8 1.2 2.4 2.3 2.6 1.2
## $ May : num 3.3 3.6 4.9 3.7 3 0.2 1.6 2.1 1.5 0.3
## $ Se : int 1 1 1 1 1 0 0 0 0 0
## $ Xe : num 2.9 3.1 3.3 2.9 2.8 0 0 0 0 0
## $ Xw : num 0 0 0 0 0 1.2 2.4 2.3 2.6 1.2
## $ Logic: logi TRUE TRUE TRUE TRUE TRUE FALSE ...
dim(Precip)
## [1] 10 9
# The length of a data frame is its number of columns.
n <- length(Precip)
n
## [1] 9
#' Count the columns of a data frame by class
#'
#' @param Precip A data frame (any data frame, not only the precipitation
#'   table it is named after).
#' @return A one-dimensional table, headed `vec`, giving the number of columns
#'   of each class. A data frame without columns gives an empty table.
myfun <- function(Precip) {
  # Iterate over the argument's own columns instead of relying on a global
  # column count. Only the first class of each column is used, so a column
  # with several classes (e.g. an ordered factor) still counts once.
  vec <- vapply(
    Precip,
    function(column) class(column)[1],
    character(1),
    USE.NAMES = FALSE
  )
  # The local name `vec` becomes the heading of the printed table.
  table(vec)
}
# Store the class counts and show them in one step: wrapping the assignment
# in parentheses makes its value visible, so it is printed.
# NOTE(review): the variable name `table` is the same as the base function;
# calls to table() keep working because R skips non-function objects when it
# looks up a function.
(table <- myfun(Precip))
## vec
## character factor integer logical numeric
## 1 2 1 1 4
######## Work along - Functions on matrices and lists ########
source("http://www3.nd.edu/~steve/computing_with_data_2014/9_Functions_matrices_lists/work_along_data_S9.R")
# Compute the sums of the non-negative values of each column of mat2.
class(mat2)
## [1] "matrix"
# Work on an unlabelled copy of mat2 and zero out its negative entries; this
# adapts to any matrix size instead of assuming 5 rows and 6 columns.
s <- unname(mat2)
s[s < 0] <- 0
# With the negatives zeroed, the column sums are the sums of the non-negative
# values of each column.
mat1 <- colSums(s)
# output
s
## [,1] [,2] [,3] [,4] [,5] [,6]
## [1,] 0.2352207 0.1402782 0.0000000 0.0000000 0.3616625 0.5982542
## [2,] 0.0000000 0.0000000 0.0000000 0.0000000 0.3469644 0.0000000
## [3,] 0.0000000 0.0000000 1.8456363 0.0000000 0.1897365 2.7180556
## [4,] 0.0000000 0.0000000 0.3940541 0.0000000 0.0000000 0.1912444
## [5,] 0.0000000 0.0000000 0.7975285 0.3641867 0.3265492 0.0000000
mat1
## [1] 0.2352207 0.1402782 3.0372189 0.3641867 1.2249126 3.5075542
######## Working with Data frames_Ontime flight data ########
# Load the four CSV files of the on-time flight project from the working
# directory; each file has a header row.
setwd('D:\\R Code')
air_name <- read.csv("air_carrier_names.csv", header = TRUE)
codes <- read.csv("airport_codes.csv", header = TRUE)
ontime <- read.csv("ONTIME1.csv", header = TRUE)
carrier <- read.csv("CARRIER.csv", header = TRUE)
# 1. Are there any flights with missing CARRIER information?
# Answer: No
sum(is.na(ontime$CARRIER))
## [1] 0
# 2. Are there any flights with carrier codes that aren't found in the carrier look-up table?
# Answer: No
# Compare the distinct codes themselves rather than factor levels: levels()
# returns NULL for character columns (what read.csv() produces by default
# since R 4.0.0), which would make this check pass without testing anything.
# as.character() makes the code work for factor and character columns alike.
a <- sort(unique(as.character(ontime$CARRIER)))
b <- unique(as.character(carrier$Code))
a %in% b
## [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [15] TRUE TRUE
# 3. How many airlines had a flight in January 2013 in this database? HINT: Use unique
# Answer: 16
# Keep the first occurrence of every airline ID; the number of IDs kept is
# the number of airlines.
num.airlines <- ontime$AIRLINE_ID[!duplicated(ontime$AIRLINE_ID)]
num.airlines
## [1] 20363 19805 19930 20409 19790 20366 20436 20437 19690 20398 20304
## [12] 19977 20355 21171 19393 20378
length(num.airlines)
## [1] 16
# 4. Carefully read the help file for the merge function. Merge the ontime flight data
# with the airport codes to create a text field description of the originating airport
# to the ontime flight data.frame. You'll need to specify the columns in the ontime flight
# data.frame and the airport code data.frame that you want to match for the merge operation.
# Also, set the name of the airport description column to a name that clearly describes what it is.
# When done, use the str command to exhibit the characteristics of the new data.frame.
# Note that this didn't introduce any new records.
# Distinct origin airport IDs, in order of first appearance, and whether each
# one is present in the airport look-up table.
# NOTE(review): the variable name `c` is the same as the base function; calls
# to c() keep working because R skips non-function objects when it looks up a
# function.
c <- ontime$ORIGIN_AIRPORT_ID[!duplicated(ontime$ORIGIN_AIRPORT_ID)]
is.element(c, codes$Code)
## [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [15] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [29] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [43] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [57] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [71] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [85] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [99] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [113] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [127] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [141] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [155] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [169] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [183] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [197] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [211] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [225] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [239] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [253] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [267] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [281] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [295] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
# Number of distinct origin airports.
length(c)
## [1] 306
# Attach the airport description to every flight by matching the flight's
# origin airport ID against the look-up table's Code column.
merged.data <- merge(
  x = ontime, y = codes,
  by.x = "ORIGIN_AIRPORT_ID", by.y = "Code"
)
# merge() puts the key column first and the look-up columns last; restore the
# original column order and place the description (column 19) right after the
# airport ID.
merged.data <- merged.data[, c(2:6, 1, 19, 7:18)]
names(merged.data)[7] <- "CITY_STATE/COUNTRY_NAME OF AIRPORT"
str(merged.data)
## 'data.frame': 509519 obs. of 19 variables:
## $ YEAR : int 2013 2013 2013 2013 2013 2013 2013 2013 2013 2013 ...
## $ MONTH : int 1 1 1 1 1 1 1 1 1 1 ...
## $ DAY_OF_MONTH : int 15 22 20 22 21 28 17 11 28 10 ...
## $ AIRLINE_ID : int 20366 20378 20366 20366 20366 20366 20366 20366 20366 20366 ...
## $ CARRIER : Factor w/ 16 levels "9E","AA","AS",..: 6 16 6 6 6 6 6 6 6 6 ...
## $ ORIGIN_AIRPORT_ID : int 10135 10135 10135 10135 10135 10135 10135 10135 10135 10135 ...
## $ CITY_STATE/COUNTRY_NAME OF AIRPORT: Factor w/ 6232 levels "47-Mile Mine, AK: 47-Mile Mine Airport",..: 134 134 134 134 134 134 134 134 134 134 ...
## $ ORIGIN_AIRPORT_SEQ_ID : int 1013503 1013503 1013503 1013503 1013503 1013503 1013503 1013503 1013503 1013503 ...
## $ ORIGIN_CITY_MARKET_ID : int 30135 30135 30135 30135 30135 30135 30135 30135 30135 30135 ...
## $ DEST_AIRPORT_ID : int 13930 11057 10397 13930 13930 13930 13930 13930 13930 13930 ...
## $ DEST_AIRPORT_SEQ_ID : int 1393002 1105703 1039705 1393002 1393002 1393002 1393002 1393002 1393002 1393002 ...
## $ DEST_CITY_MARKET_ID : int 30977 31057 30397 30977 30977 30977 30977 30977 30977 30977 ...
## $ ARR_TIME : int 1257 1148 840 1321 1318 1937 2026 1240 1253 1226 ...
## $ ARR_DELAY : int 7 72 -17 31 28 12 61 -10 3 -24 ...
## $ CARRIER_DELAY : int NA 72 NA 0 6 NA 2 NA NA NA ...
## $ WEATHER_DELAY : int NA 0 NA 0 0 NA 0 NA NA NA ...
## $ NAS_DELAY : int NA 0 NA 31 7 NA 0 NA NA NA ...
## $ SECURITY_DELAY : int NA 0 NA 0 0 NA 0 NA NA NA ...
## $ LATE_AIRCRAFT_DELAY : int NA 0 NA 0 15 NA 59 NA NA NA ...
# 5. Use the lookup table of carrier codes to find the code for United Airlines (You can do that in Excel).
# Create a sub-data.frame containing all United Airlines flights.
UA <- subset(merged.data, CARRIER == "UA")
# 6. There isn't a field exactly specifying in a "yes" or "no" whether there was a "DELAY",
# so I'd like you to create one. Read the descriptions of the fields on the above website,
# look at samples of the records, and decide what property of one of the existing fields
# characterizes when there is a delay. Then create a new data.frame of the United Airlines
# flights with a new column that is a logical vector saying whether there is or isn't an official delay.
# Answer: Arrival delay: 15 minutes or more = TRUE
# The flag is NA for flights whose arrival delay is missing.
delay <- UA$ARR_DELAY >= 15
# Copy the United flights and append the flag as a new last column.
UAir <- UA
UAir[["OFFICIAL_DELAY"]] <- delay
# Flights with an unknown delay status.
sum(is.na(delay))
## [1] 300
# Flights that were officially delayed.
sum(delay, na.rm = TRUE)
## [1] 6634
# 7. What percentage of United Airlines flight had a delay?
# Answer: About 16.5 percent of the United Airlines flights with a known
# arrival delay were delayed.
# The mean of a logical vector is its share of TRUE values; na.rm = TRUE
# removes the flights with an unknown delay status from the numerator and the
# denominator alike. (The earlier formula wrapped the subtraction inside
# length(), so those flights were never removed from the denominator.)
percentage <- mean(delay, na.rm = TRUE)
percentage
## approx. 0.165 (the 6634 delayed flights over the flights with a known arrival delay)
# 8. Sort the data.frame generated in 5 by the amount of delay in decreasing order. HINT: use the order function.
# Rows are rearranged from the longest to the shortest arrival delay; flights
# with a missing delay end up last.
UAIR <- UAir[with(UAir, order(ARR_DELAY, decreasing = TRUE)), ]
# 9. Which originating airports had the 3 longest delays for United Airlines in January 2013?
# Answer: No.1 January 24th, 13830, Kahului, HI: Kahului Airport 1017 minutes
# No.2 January 16th, 13930, Chicago, IL: Chicago O'Hare International 565 minutes
# No.3 January 3rd, 13930, Chicago, IL: Chicago O'Hare International 534 minutes
# 10. In doing this project, many natural questions should have occurred to you. Here are a few.
# a). Which airlines had the best on-time performance record?
# Answer: WN (Southwest Airlines) had the most on-time arrivals: 57378 of its flights arrived on time (a raw count, not a rate).
# Flights with a known arrival delay.
flight <- subset(merged.data, !is.na(ARR_DELAY))
# Flights that arrived on time or early (arrival delay of zero or less).
ontime_airlines <- flight[flight$ARR_DELAY <= 0, ]
# Number of on-time flights per carrier.
on_time <- table(ontime_airlines[["CARRIER"]])
on_time
##
## 9E AA AS B6 DL EV F9 FL HA MQ OO UA
## 15236 27224 8020 11588 41020 31996 2321 11149 3881 21955 29612 26504
## US VX WN YV
## 20931 3352 57378 6590
ontime_record <- as.matrix(on_time)
# The same counts from largest to smallest, without the carrier labels.
decreasing_ontime_record <- sort(as.vector(ontime_record), decreasing = TRUE)
decreasing_ontime_record
## [1] 57378 41020 31996 29612 27224 26504 21955 20931 15236 11588 11149
## [12] 8020 6590 3881 3352 2321
# Ranking by the average arrival delay of the on-time flights instead:
# carrier 9E has the most negative mean (about -15 minutes), so it ranks
# first on that measure.
library(plyr)
# Mean arrival delay of the on-time flights, one value per carrier.
mean_ontime_carrier <- daply(
  ontime_airlines, .(CARRIER),
  function(carrier_rows) mean(carrier_rows$ARR_DELAY, na.rm = TRUE)
)
mean_ontime_carrier
## 9E AA AS B6 DL EV
## -15.026057 -12.716390 -13.818454 -13.222126 -13.654461 -12.232060
## F9 FL HA MQ OO UA
## -8.333046 -12.466230 -6.097398 -12.051423 -10.740950 -14.948347
## US VX WN YV
## -11.540872 -14.520286 -10.277807 -10.710622
length(mean_ontime_carrier)
## [1] 16
# Largest mean first, so the carrier that arrives earliest on average is the
# last entry.
mean_ontime_carrier[order(mean_ontime_carrier, decreasing = TRUE)]
## HA F9 WN YV OO US
## -6.097398 -8.333046 -10.277807 -10.710622 -10.740950 -11.540872
## MQ EV FL AA B6 DL
## -12.051423 -12.232060 -12.466230 -12.716390 -13.222126 -13.654461
## AS VX UA 9E
## -13.818454 -14.520286 -14.948347 -15.026057
# b). What are the prevalent reasons for delays?
# Answer: Carrier delay, Late Aircraft Delay and NAS Delay are prevalent reasons for delay.
# Flights with a recorded carrier delay; the totals below are the summed
# minutes of each delay cause over those flights.
flight1 <- subset(merged.data, !is.na(CARRIER_DELAY))
with(flight1, sum(CARRIER_DELAY))
## [1] 1564374
with(flight1, sum(WEATHER_DELAY))
## [1] 233603
with(flight1, sum(NAS_DELAY))
## [1] 1148931
with(flight1, sum(SECURITY_DELAY))
## [1] 6304
with(flight1, sum(LATE_AIRCRAFT_DELAY))
## [1] 1845040
# c). Which airports had the greatest number of weather delays? How about the greatest number of different days with weather delays?
# Answer: the longest single weather delay originated at Milwaukee, WI:
# General Mitchell International (first row of the ordered listing below;
# reported as 1591 minutes). The airport and the day of the month with the
# most weather-delayed flights are the first entries of the two sorted tables
# computed below; re-run this section to read them off.
# NOTE: WEATHER_DELAY is 0, not NA, for delayed flights whose delay had another
# cause, so weather delays have to be counted with WEATHER_DELAY > 0. Counting
# every non-missing row counts all flights that have delay-cause data, and
# those counts are numbers of flights, not minutes.
active_flight <- merged.data[!is.na(merged.data$WEATHER_DELAY),] # Eliminate missing values in weather delays
# Flights ordered from the longest to the shortest weather delay.
weather_delay_airport <- active_flight[order(active_flight$WEATHER_DELAY,decreasing=TRUE),]
head(weather_delay_airport[5:7])
## CARRIER ORIGIN_AIRPORT_ID
## 331259 AA 13342
## 422933 MQ 14492
## 109706 DL 11278
## 496084 MQ 15048
## 145045 MQ 11298
## 414980 MQ 14122
## CITY_STATE/COUNTRY_NAME OF AIRPORT
## 331259 Milwaukee, WI: General Mitchell International
## 422933 Raleigh/Durham, NC: Raleigh-Durham International
## 109706 Washington, DC: Ronald Reagan Washington National
## 496084 Sioux City, IA: Sioux Gateway/Col. Bud Day Field
## 145045 Dallas/Fort Worth, TX: Dallas/Fort Worth International
## 414980 Pittsburgh, PA: Pittsburgh International
# Keep only the flights that actually lost time to weather.
weather_events <- weather_delay_airport[weather_delay_airport$WEATHER_DELAY > 0, ]
# Number of weather-delayed flights per origin airport, largest first.
# as.character() keeps airports without any weather delay out of the table
# when the description column is a factor.
weather_delays_by_airport <- sort(
  table(as.character(weather_events[["CITY_STATE/COUNTRY_NAME OF AIRPORT"]])),
  decreasing = TRUE
)
head(weather_delays_by_airport)
# Number of weather-delayed flights per day of the month.
delay_days <- table(weather_events$DAY_OF_MONTH)
delay_days
# The same daily counts, largest first.
decreasing_delay_days <- sort(delay_days, decreasing = TRUE)
decreasing_delay_days
## (Printed tables are not reproduced here: the counts pasted previously
## included flights with a weather delay of 0 minutes and no longer apply.)
# Complemental computation
# Mean arrival delay per carrier over all flights with a known arrival delay
# (early arrivals count as negative delays).
mean_delays_by_carrier <- daply(
  flight, .(CARRIER),
  function(carrier_rows) mean(carrier_rows$ARR_DELAY, na.rm = TRUE)
)
mean_delays_by_carrier
## 9E AA AS B6 DL EV
## 3.3237369 3.0548632 -2.4638241 4.7916232 -1.9670318 11.0208181
## F9 FL HA MQ OO UA
## 11.5732803 -3.2131513 1.9762383 7.8210834 5.8476693 -0.3109001
## US VX WN YV
## 1.3495896 -7.3915306 0.5217229 5.3267816
length(mean_delays_by_carrier)
## [1] 16
# Largest mean delay first.
mean_delays_by_carrier[order(mean_delays_by_carrier, decreasing = TRUE)]
## F9 EV MQ OO YV B6
## 11.5732803 11.0208181 7.8210834 5.8476693 5.3267816 4.7916232
## 9E AA HA US WN UA
## 3.3237369 3.0548632 1.9762383 1.3495896 0.5217229 -0.3109001
## DL AS FL VX
## -1.9670318 -2.4638241 -3.2131513 -7.3915306
# The End