# Independent_Study_Yanxin

############ Independent Study_Yanxin Li ################
######## Work Along - Vectors ########
source("http://www3.nd.edu/~steve/computing_with_data_2014/3_Vectors/work_along_data_S3.R")
# What is the length of x? 47
length(x)

## [1] 47

# What is the class of x? numeric
class(x)

## [1] "numeric"

# What is the name of the 4th entry in x? N4
names(x[4])

## [1] "N4"

# What are the mean, median and 3rd quartile of x? mean -0.04026, median 0.25190, 3rd quartile 1.38500
summary(x)

##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## -4.15900 -1.24300  0.25190 -0.04026  1.38500  3.08500

# What are the possible values for an entry of y? 0, 1
y1 <- as.numeric(y)
y1

##  [1] 1 0 1 1 0 1 1 0 1 1 0 1 1 0 1 1 0 1

# How many of each value are there? 
sum(y) # number of 1 (12)

## [1] 12

length(y)-sum(y) # number of 0 (6)

## [1] 6

# How many entries in x are less than 0? 21
sum(x < 0)

## [1] 21

# Form a new vector of all entries in x greater than the median
# Entries of x that lie strictly above the median of x. which() discards any
# NA comparison results, so missing values never make it into the selection.
x.sub <- x[which(x > median(x))]
x.sub

##        N1        N2        N7        N8       N10       N13       N14 
## 0.9886213 1.0515351 1.4071772 0.3987219 1.7079851 2.3304798 1.8955924 
##       N17       N19       N20       N21       N22       N23       N26 
## 0.7890702 2.8608310 2.1563374 1.3703668 0.6488079 1.3986529 1.1803058 
##       N28       N29       N33       N35       N39       N41       N43 
## 0.4512312 1.0588708 1.7227900 0.6428734 1.7454097 3.0852416 0.3280050 
##       N46       N47 
## 1.5686945 3.0681204

# What is the sum of v1? 28
sum(v1)

## [1] 28

# What is the sum of v2? 17
sum(v2, na.rm = TRUE)

## [1] 17

######## Work Along - Matrices ########
source("http://www3.nd.edu/~steve/computing_with_data_2014/4_Matrices/work_along_data_S4.R")
# How many rows and columns are in M1? row:2, column:6
nrow(M1)

## [1] 2

ncol(M1)

## [1] 6

# Rearrange the underlying entries of M1 as a 4 x 3 matrix
# Reshape M1's own entries (read in column-major order) into 4 rows and
# 3 columns, instead of hard-coding the values 1:12.
M1.rea <- matrix(M1, nrow = 4, ncol = 3)

# Replace the NA values in M2 by 0's
M2

##      [,1] [,2] [,3] [,4] [,5]
## [1,]    1   NA    9   13   17
## [2,]    2    6   10   NA   18
## [3,]    3    7   11   15   NA
## [4,]    4    8   12   16   20

# Set every missing entry of M2 to 0 through logical indexing, so the NA
# positions do not have to be located and typed in by hand.
M2[is.na(M2)] <- 0

# Form a matrix that binds the columns of M1 and the matrix z
# Check that the dimensions match first
M3 <- cbind(M1,z); M3

##      [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8]
## [1,] "1"  "3"  "5"  "7"  "9"  "11" "a"  "a" 
## [2,] "2"  "4"  "6"  "8"  "10" "12" "a"  "a"

M4 <- rbind(t(M1),z); M4

##      [,1] [,2]
## [1,] "1"  "2" 
## [2,] "3"  "4" 
## [3,] "5"  "6" 
## [4,] "7"  "8" 
## [5,] "9"  "10"
## [6,] "11" "12"
## [7,] "a"  "a" 
## [8,] "a"  "a"

dim(M1); dim(z)

## [1] 2 6

## [1] 2 2

nrow(z); ncol(z)

## [1] 2

## [1] 2

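# What happened to the numbers? They were coerced to character strings (note the quotes in M3 and M4): z holds characters, and a matrix can store only one type.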
dim(M3); dim(M4)

## [1] 2 8

## [1] 8 2

######## Work Along - Data Frames ########
# Display the variables and variable classes for the data.frame mtcars
mtcars

##                      mpg cyl  disp  hp drat    wt  qsec vs am gear carb
## Mazda RX4           21.0   6 160.0 110 3.90 2.620 16.46  0  1    4    4
## Mazda RX4 Wag       21.0   6 160.0 110 3.90 2.875 17.02  0  1    4    4
## Datsun 710          22.8   4 108.0  93 3.85 2.320 18.61  1  1    4    1
## Hornet 4 Drive      21.4   6 258.0 110 3.08 3.215 19.44  1  0    3    1
## Hornet Sportabout   18.7   8 360.0 175 3.15 3.440 17.02  0  0    3    2
## Valiant             18.1   6 225.0 105 2.76 3.460 20.22  1  0    3    1
## Duster 360          14.3   8 360.0 245 3.21 3.570 15.84  0  0    3    4
## Merc 240D           24.4   4 146.7  62 3.69 3.190 20.00  1  0    4    2
## Merc 230            22.8   4 140.8  95 3.92 3.150 22.90  1  0    4    2
## Merc 280            19.2   6 167.6 123 3.92 3.440 18.30  1  0    4    4
## Merc 280C           17.8   6 167.6 123 3.92 3.440 18.90  1  0    4    4
## Merc 450SE          16.4   8 275.8 180 3.07 4.070 17.40  0  0    3    3
## Merc 450SL          17.3   8 275.8 180 3.07 3.730 17.60  0  0    3    3
## Merc 450SLC         15.2   8 275.8 180 3.07 3.780 18.00  0  0    3    3
## Cadillac Fleetwood  10.4   8 472.0 205 2.93 5.250 17.98  0  0    3    4
## Lincoln Continental 10.4   8 460.0 215 3.00 5.424 17.82  0  0    3    4
## Chrysler Imperial   14.7   8 440.0 230 3.23 5.345 17.42  0  0    3    4
## Fiat 128            32.4   4  78.7  66 4.08 2.200 19.47  1  1    4    1
## Honda Civic         30.4   4  75.7  52 4.93 1.615 18.52  1  1    4    2
## Toyota Corolla      33.9   4  71.1  65 4.22 1.835 19.90  1  1    4    1
## Toyota Corona       21.5   4 120.1  97 3.70 2.465 20.01  1  0    3    1
## Dodge Challenger    15.5   8 318.0 150 2.76 3.520 16.87  0  0    3    2
## AMC Javelin         15.2   8 304.0 150 3.15 3.435 17.30  0  0    3    2
## Camaro Z28          13.3   8 350.0 245 3.73 3.840 15.41  0  0    3    4
## Pontiac Firebird    19.2   8 400.0 175 3.08 3.845 17.05  0  0    3    2
## Fiat X1-9           27.3   4  79.0  66 4.08 1.935 18.90  1  1    4    1
## Porsche 914-2       26.0   4 120.3  91 4.43 2.140 16.70  0  1    5    2
## Lotus Europa        30.4   4  95.1 113 3.77 1.513 16.90  1  1    5    2
## Ford Pantera L      15.8   8 351.0 264 4.22 3.170 14.50  0  1    5    4
## Ferrari Dino        19.7   6 145.0 175 3.62 2.770 15.50  0  1    5    6
## Maserati Bora       15.0   8 301.0 335 3.54 3.570 14.60  0  1    5    8
## Volvo 142E          21.4   4 121.0 109 4.11 2.780 18.60  1  1    4    2

class(mtcars)

## [1] "data.frame"

class(mtcars[3:7,4])

## [1] "numeric"

class(mtcars[4:8,3:10])

## [1] "data.frame"

head(mtcars)

##                    mpg cyl disp  hp drat    wt  qsec vs am gear carb
## Mazda RX4         21.0   6  160 110 3.90 2.620 16.46  0  1    4    4
## Mazda RX4 Wag     21.0   6  160 110 3.90 2.875 17.02  0  1    4    4
## Datsun 710        22.8   4  108  93 3.85 2.320 18.61  1  1    4    1
## Hornet 4 Drive    21.4   6  258 110 3.08 3.215 19.44  1  0    3    1
## Hornet Sportabout 18.7   8  360 175 3.15 3.440 17.02  0  0    3    2
## Valiant           18.1   6  225 105 2.76 3.460 20.22  1  0    3    1

dim(mtcars)

## [1] 32 11

# How many cars are there for each possible choice of number of cylinders.
# Tabulate the cars by number of cylinders in a single call, so the distinct
# cylinder values do not have to be known and typed in ahead of time.
cyl_counts <- table(mtcars$cyl)
cyl_counts # 4 cylinders: 11, 6 cylinders: 7, 8 cylinders: 14

## 
##  4  6  8 
## 11  7 14

# Create a new data.frame that adds a column for the new computed variable mpg/cyl
# Miles per gallon per cylinder, computed column-wise inside mtcars.
mpg.cyl <- with(mtcars, mpg / cyl)
# Append the ratio as a twelfth column named mpg.cyl.
mtcars.new <- cbind(mtcars, mpg.cyl = mpg.cyl)
head(mtcars.new)

##                    mpg cyl disp  hp drat    wt  qsec vs am gear carb
## Mazda RX4         21.0   6  160 110 3.90 2.620 16.46  0  1    4    4
## Mazda RX4 Wag     21.0   6  160 110 3.90 2.875 17.02  0  1    4    4
## Datsun 710        22.8   4  108  93 3.85 2.320 18.61  1  1    4    1
## Hornet 4 Drive    21.4   6  258 110 3.08 3.215 19.44  1  0    3    1
## Hornet Sportabout 18.7   8  360 175 3.15 3.440 17.02  0  0    3    2
## Valiant           18.1   6  225 105 2.76 3.460 20.22  1  0    3    1
##                    mpg.cyl
## Mazda RX4         3.500000
## Mazda RX4 Wag     3.500000
## Datsun 710        5.700000
## Hornet 4 Drive    3.566667
## Hornet Sportabout 2.337500
## Valiant           3.016667

# What is the mean horsepower (hp) for the cars with 4 gears?
# Keep only the four-gear cars, then average their horsepower.
data <- subset(mtcars, gear == 4)
mean(data[["hp"]])

## [1] 89.5

######## Work along - Loops, etc ########
source("http://www3.nd.edu/~steve/computing_with_data_2014/7_Loops_etc/work_along_data_S7.R")
# Compute a sequence r of length 100 such that r[1] = 0 and r[i+1] is randomly sampled from a normal distribution with mean r[i] and standard deviation 1. 
# Inspect the first 5 entries and the last 5 entries. Any patterns?
# Random walk of length 100: r[1] is 0 and every later entry is drawn from a
# normal distribution centred on the previous entry with standard deviation 1.
# The vector is allocated at its full length up front instead of being grown
# by one element per iteration.
n_steps <- 100
r <- numeric(n_steps)
for (i in seq_len(n_steps - 1)) {
  r[i + 1] <- rnorm(1, mean = r[i], sd = 1)
}
r[1:5]; tail(r, n = 5); class(r[1:7])

## [1]  0.0000000 -0.4293217  1.1315557  1.9354511  1.8365423

## [1] 0.5505827 0.9085822 1.2888064 0.6252272 1.0861533

## [1] "numeric"

# prob_L is list of character vectors. Use a for loop to define a vector p1 such that p1[j] is the first entry of the \(j^{th}\) component of prob_L.
prob_L

## [[1]]
## [1] "word"  "word2"
## 
## [[2]]
## [1] "A" "2"
## 
## [[3]]
## [1] "8"
## 
## [[4]]
## [1] FALSE
## 
## [[5]]
## [1] "yes"      "no"       "maybe-so"

prob_L[[1]][1]

## [1] "word"

# First entry of every component of prob_L. seq_along() covers however many
# components the list has, the result is allocated up front, and
# as.character() makes the conversion of non-character entries (such as
# FALSE) explicit instead of relying on silent coercion.
p1 <- character(length(prob_L))
for (i in seq_along(prob_L)) {
  p1[i] <- as.character(prob_L[[i]][1])
}
p1

## [1] "word"  "A"     "8"     "FALSE" "yes"

# Create a vector where the \(j^{th}\) entry is the second entry of the \(j^{th}\) component of prob_L.
# Second entry of every component of prob_L. Components with fewer than two
# entries yield NA; those are dropped afterwards by testing for NA rather
# than by hard-coding which positions survive.
p2 <- character(length(prob_L))
for (i in seq_along(prob_L)) {
  p2[i] <- as.character(prob_L[[i]][2])
}
p2 <- p2[!is.na(p2)]
p2

## [1] "word2" "2"     "no"

######## Work along - Defining functions ########
# Write a function of one variable that for an input numeric vector, computes the mean of the samples above the median.
# Check that the input is a numeric vector and if it isn't, 
# return an appropriate message about the wrong function argument.
u <- runif(500)
median(u)

## [1] 0.4883122

# Mean of the entries of a numeric vector that lie strictly above its median.
#
# v: a numeric vector; missing values are ignored.
# Returns a single number (NaN when no entry exceeds the median, e.g. for a
# constant or empty vector).
# Stops with an error message when v is not numeric.
mean_above_median <- function(v) {
  if (!is.numeric(v)) {
    stop("mean_above_median() needs a numeric vector, not an object of class '",
         class(v)[1], "'", call. = FALSE)
  }
  cutoff <- median(v, na.rm = TRUE)
  # which() drops NA comparisons, so only real values above the cutoff remain.
  mean(v[which(v > cutoff)])
}

# Entries of u above the median. Entries at or below the median are dropped
# rather than replaced by 0, so they no longer drag the average down.
mean_u <- u[u > median(u)]
mean_above_median(u)

## About 0.745 for the sample whose median is shown above: twice the
## zero-padded average of 0.3727057 that was recorded before this fix.

class(u)

## [1] "numeric"

class(mean_u)

## [1] "numeric"

# Write a function that given a data.frame, reports the numbers of columns of each possible class.
# The possible classes are numeric, integer, character, logical, factor
# NOTE(review): setwd() ties the script to one machine; the call is kept
# because the data file below is read relative to this directory.
setwd('D:\\R Code')
# stringsAsFactors = TRUE is spelled out because R 4.0.0 changed the default
# to FALSE; the str() output recorded below shows Coast and State as factors.
Precip <- read.table("Precip.txt", stringsAsFactors = TRUE)
colnames(Precip) <- c("Coast","City","State","April","May","Se","Xe","Xw","Logic")
# City is free text, so it is converted from factor to character.
Precip$City <- as.character(Precip$City)
Precip

##    Coast         City State April May Se  Xe  Xw Logic
## 1   East       Albany    NY   2.9 3.3  1 2.9 0.0  TRUE
## 2   East   Washington    DC   3.1 3.6  1 3.1 0.0  TRUE
## 3   East Jacksonville    FL   3.3 4.9  1 3.3 0.0  TRUE
## 4   East      Raleigh    NC   2.9 3.7  1 2.9 0.0  TRUE
## 5   East   Burlington    VT   2.8 3.0  1 2.8 0.0  TRUE
## 6   West  Los_Angeles    CA   1.2 0.2  0 0.0 1.2 FALSE
## 7   West      Seattle    WA   2.4 1.6  0 0.0 2.4 FALSE
## 8   West     Portland    OR   2.3 2.1  0 0.0 2.3 FALSE
## 9   West    San_Diego    CA   2.6 1.5  0 0.0 2.6 FALSE
## 10  West       Fresno    CA   1.2 0.3  0 0.0 1.2 FALSE

str(Precip)

## 'data.frame':    10 obs. of  9 variables:
##  $ Coast: Factor w/ 2 levels "East","West": 1 1 1 1 1 2 2 2 2 2
##  $ City : chr  "Albany" "Washington" "Jacksonville" "Raleigh" ...
##  $ State: Factor w/ 8 levels "CA","DC","FL",..: 5 2 3 4 7 1 8 6 1 1
##  $ April: num  2.9 3.1 3.3 2.9 2.8 1.2 2.4 2.3 2.6 1.2
##  $ May  : num  3.3 3.6 4.9 3.7 3 0.2 1.6 2.1 1.5 0.3
##  $ Se   : int  1 1 1 1 1 0 0 0 0 0
##  $ Xe   : num  2.9 3.1 3.3 2.9 2.8 0 0 0 0 0
##  $ Xw   : num  0 0 0 0 0 1.2 2.4 2.3 2.6 1.2
##  $ Logic: logi  TRUE TRUE TRUE TRUE TRUE FALSE ...

dim(Precip)

## [1] 10  9

n = ncol(Precip)
n

## [1] 9

# Count how many columns of a data.frame have each class.
#
# Precip: the data.frame to inspect (the parameter name is kept for
#         compatibility; any data.frame works).
# Returns a table, labelled "vec", with one count per class that occurs.
#
# The columns are taken from the argument itself rather than from a global
# column count, and only the first element of class() is used so that a
# column with several classes (e.g. an ordered factor) yields one label.
myfun <- function(Precip) {
  stopifnot(is.data.frame(Precip))
  vec <- vapply(Precip, function(column) class(column)[1], character(1))
  table(vec)
}
table <- myfun(Precip)
table

## vec
## character    factor   integer   logical   numeric 
##         1         2         1         1         4

######## Work along - Functions on matrices and lists ########
source("http://www3.nd.edu/~steve/computing_with_data_2014/9_Functions_matrices_lists/work_along_data_S9.R")
# Compute the sums of the non-negative values of each column of mat2.
mat1 <- c()
class(mat2)

## [1] "matrix"

# Copy of mat2 with every negative entry replaced by 0, so that the column
# sums below add up only the non-negative values. pmax() handles the whole
# matrix at once, which removes the hard-coded 5 x 6 loop bounds; unname()
# keeps s free of dimnames, as it was when built from an empty matrix.
s <- unname(pmax(mat2, 0))
# Sum of the non-negative values of each column of mat2.
mat1 <- colSums(s)
# output
s

##           [,1]      [,2]      [,3]      [,4]      [,5]      [,6]
## [1,] 0.2352207 0.1402782 0.0000000 0.0000000 0.3616625 0.5982542
## [2,] 0.0000000 0.0000000 0.0000000 0.0000000 0.3469644 0.0000000
## [3,] 0.0000000 0.0000000 1.8456363 0.0000000 0.1897365 2.7180556
## [4,] 0.0000000 0.0000000 0.3940541 0.0000000 0.0000000 0.1912444
## [5,] 0.0000000 0.0000000 0.7975285 0.3641867 0.3265492 0.0000000

mat1

## [1] 0.2352207 0.1402782 3.0372189 0.3641867 1.2249126 3.5075542

######## Working with Data frames_Ontime flight data ########
# NOTE(review): setwd() ties the script to one machine; the call is kept
# because the CSV files below are read relative to this directory.
setwd('D:\\R Code')
# stringsAsFactors = TRUE is spelled out because R 4.0.0 changed the default
# to FALSE; the levels() call and the str() output further down expect text
# columns such as CARRIER to be factors.
air_name <- read.csv(file = "air_carrier_names.csv", header = TRUE,
                     stringsAsFactors = TRUE)
codes <- read.csv(file = "airport_codes.csv", header = TRUE,
                  stringsAsFactors = TRUE)
ontime <- read.csv(file = "ONTIME1.csv", header = TRUE,
                   stringsAsFactors = TRUE)
carrier <- read.csv(file = "CARRIER.csv", header = TRUE,
                    stringsAsFactors = TRUE)
# 1. Are there any flights with missing CARRIER information? 
# Answer: No
sum(is.na(ontime$CARRIER))

## [1] 0

# 2. Are there any flights with carrier codes that aren't found in the carrier look-up table? 
# Answer: No
# Distinct carrier codes on each side, compared as plain strings so the check
# does not depend on the columns having been read in as factors (levels()
# returns NULL for a character column, which would make the test vacuous).
a <- sort(unique(as.character(ontime$CARRIER)))
b <- unique(as.character(carrier$Code))
a %in% b

##  [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [15] TRUE TRUE

# 3. How many airlines had a flight in January 2013 in this database? HINT: Use unique
# Answer: 16
num.airlines <- unique(ontime$AIRLINE_ID)
num.airlines

##  [1] 20363 19805 19930 20409 19790 20366 20436 20437 19690 20398 20304
## [12] 19977 20355 21171 19393 20378

length(num.airlines)

## [1] 16

# 4. Carefully read the help file for the merge function. Merge the ontime flight data 
#    with the airport codes to create a text field description of the originating airport 
#    to the ontime flight data.frame. You'll need to specify the columns in the ontime flight 
#    data.frame and the airport code data.frame that you want to match for the merge operation.
#    Also, set the name of the airport description column to a name that clearly describes what it is. 
#    When done, use the str command to exhibit the characteristics of the new data.frame. 
#    Note that this didn't introduce any new records.
c <- unique(ontime$ORIGIN_AIRPORT_ID)
c %in% codes$Code

##   [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
##  [15] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
##  [29] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
##  [43] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
##  [57] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
##  [71] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
##  [85] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
##  [99] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [113] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [127] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [141] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [155] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [169] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [183] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [197] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [211] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [225] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [239] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [253] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [267] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [281] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [295] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE

length(c)

## [1] 306

# Attach the airport lookup columns to every flight by matching the origin
# airport id against the Code column of the lookup table.
merged.data <- merge(x = ontime, y = codes,
                     by.x = "ORIGIN_AIRPORT_ID", by.y = "Code")
# merge() puts the key first; move it back to sixth position and place the
# column that came from the lookup table (column 19) right after it.
merged.data <- merged.data[, c(2:6, 1, 19, 7:18)]
names(merged.data)[7] <- "CITY_STATE/COUNTRY_NAME OF AIRPORT"
str(merged.data)

## 'data.frame':    509519 obs. of  19 variables:
##  $ YEAR                              : int  2013 2013 2013 2013 2013 2013 2013 2013 2013 2013 ...
##  $ MONTH                             : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ DAY_OF_MONTH                      : int  15 22 20 22 21 28 17 11 28 10 ...
##  $ AIRLINE_ID                        : int  20366 20378 20366 20366 20366 20366 20366 20366 20366 20366 ...
##  $ CARRIER                           : Factor w/ 16 levels "9E","AA","AS",..: 6 16 6 6 6 6 6 6 6 6 ...
##  $ ORIGIN_AIRPORT_ID                 : int  10135 10135 10135 10135 10135 10135 10135 10135 10135 10135 ...
##  $ CITY_STATE/COUNTRY_NAME OF AIRPORT: Factor w/ 6232 levels "47-Mile Mine, AK: 47-Mile Mine Airport",..: 134 134 134 134 134 134 134 134 134 134 ...
##  $ ORIGIN_AIRPORT_SEQ_ID             : int  1013503 1013503 1013503 1013503 1013503 1013503 1013503 1013503 1013503 1013503 ...
##  $ ORIGIN_CITY_MARKET_ID             : int  30135 30135 30135 30135 30135 30135 30135 30135 30135 30135 ...
##  $ DEST_AIRPORT_ID                   : int  13930 11057 10397 13930 13930 13930 13930 13930 13930 13930 ...
##  $ DEST_AIRPORT_SEQ_ID               : int  1393002 1105703 1039705 1393002 1393002 1393002 1393002 1393002 1393002 1393002 ...
##  $ DEST_CITY_MARKET_ID               : int  30977 31057 30397 30977 30977 30977 30977 30977 30977 30977 ...
##  $ ARR_TIME                          : int  1257 1148 840 1321 1318 1937 2026 1240 1253 1226 ...
##  $ ARR_DELAY                         : int  7 72 -17 31 28 12 61 -10 3 -24 ...
##  $ CARRIER_DELAY                     : int  NA 72 NA 0 6 NA 2 NA NA NA ...
##  $ WEATHER_DELAY                     : int  NA 0 NA 0 0 NA 0 NA NA NA ...
##  $ NAS_DELAY                         : int  NA 0 NA 31 7 NA 0 NA NA NA ...
##  $ SECURITY_DELAY                    : int  NA 0 NA 0 0 NA 0 NA NA NA ...
##  $ LATE_AIRCRAFT_DELAY               : int  NA 0 NA 0 15 NA 59 NA NA NA ...

# 5. Use the lookup table of carrier codes to find the code for United Airlines (You can do that in Excel). 
# Create a sub-data.frame containing all United Airlines flights.
# All flights whose carrier code is "UA"; subset() dispatches to the
# data.frame method on its own.
UA <- subset(merged.data, CARRIER == "UA")

# 6. There isn't a field exactly specifying in a "yes" or "no" whether there was a "DELAY", 
#    so I'd like you to create one. Read the descriptions of the fields on the above website, 
#    look at samples of the records, and decide what property of one of the existing fields 
#    characterizes when there is a delay. Then create a new data.frame of the United Airlines 
#    flights with a new column that is a logical vector saying whether there is or isn't an official delay.
# Answer: Arrival delay: 15 minutes or more = TRUE
delay <- (UA$ARR_DELAY >= 15)
UAir <- cbind(UA,OFFICIAL_DELAY=delay)
sum(is.na(delay))

## [1] 300

sum(delay,na.rm = TRUE)

## [1] 6634

# 7. What percentage of United Airlines flight had a delay?
# Answer: About 16.5 percent of the United Airlines flights with a recorded
#         arrival delay were delayed.
# Flights whose delay status is NA are left out of the denominator. (The
# subtraction used to sit inside length(), so it never took effect and the
# share was computed over all flights, giving 0.1641307.)
percentage <- sum(delay, na.rm = TRUE) / sum(!is.na(delay))
percentage

## [1] 0.1653581

# 8. Sort the data.frame generated in 5 by the amount of delay in decreasing order. HINT: use the order function.
UAIR <- UAir[order(UAir$ARR_DELAY, decreasing = TRUE),]

# 9. Which originating airports had the 3 longest delays for United Airlines in January 2013?
# Answer: No.1 January 24th, 13830, Kahului, HI: Kahului Airport   1017 minutes
#         No.2 January 16th, 13930, Chicago, IL: Chicago O'Hare International  565 minutes
#         No.3 January 3rd, 13930, Chicago, IL: Chicago O'Hare International  534 minutes 

# 10. In doing this project, many natural questions should have occurred to you. Here are a few.
# a). Which airlines had the best on-time performance record?
# Answer: WN (Southwest Airlines) had the most on-time arrivals: 57378 of its flights arrived on time or early.
# Flights that have a recorded arrival delay.
flight <- merged.data[complete.cases(merged.data$ARR_DELAY), ]
# Of those, the flights that arrived on time or early (delay of 0 or less).
ontime_airlines <- flight[flight$ARR_DELAY <= 0, ]
# Number of such flights per carrier.
on_time <- table(ontime_airlines$CARRIER)
on_time

## 
##    9E    AA    AS    B6    DL    EV    F9    FL    HA    MQ    OO    UA 
## 15236 27224  8020 11588 41020 31996  2321 11149  3881 21955 29612 26504 
##    US    VX    WN    YV 
## 20931  3352 57378  6590

# Rank the carriers by their number of on-time arrivals. sort() on the table
# keeps every count labelled with its carrier code; indexing the matrix copy
# with order() returned bare numbers that could not be traced to a carrier.
ontime_record <- as.matrix(on_time)
decreasing_ontime_record <- sort(on_time, decreasing = TRUE)
decreasing_ontime_record

## 
##    WN    DL    EV    OO    AA    UA    MQ    US    9E    B6    FL    AS 
## 57378 41020 31996 29612 27224 26504 21955 20931 15236 11588 11149  8020 
##    YV    HA    VX    F9 
##  6590  3881  3352  2321

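# If we instead rank carriers by the mean arrival delay of their on-time flights, carrier 9E comes first (about -15.0 minutes, i.e. the earliest arrivals on average). NOTE(review): the description originally given here, "Allentown/Bethlehem/Easton, PA: Lehigh Valley International", is an airport name rather than a carrier name - confirm the name for 9E in the carrier look-up table.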
library(plyr)
# Mean arrival delay per carrier among the flights that arrived on time or
# early. TRUE is written out because the shorthand T is an ordinary variable
# that can be reassigned.
mean_ontime_carrier <- daply(ontime_airlines, .(CARRIER),
                             function(df) mean(df$ARR_DELAY, na.rm = TRUE))
mean_ontime_carrier

##         9E         AA         AS         B6         DL         EV 
## -15.026057 -12.716390 -13.818454 -13.222126 -13.654461 -12.232060 
##         F9         FL         HA         MQ         OO         UA 
##  -8.333046 -12.466230  -6.097398 -12.051423 -10.740950 -14.948347 
##         US         VX         WN         YV 
## -11.540872 -14.520286 -10.277807 -10.710622

length(mean_ontime_carrier)

## [1] 16

mean_ontime_carrier[order(mean_ontime_carrier, decreasing = TRUE)]

##         HA         F9         WN         YV         OO         US 
##  -6.097398  -8.333046 -10.277807 -10.710622 -10.740950 -11.540872 
##         MQ         EV         FL         AA         B6         DL 
## -12.051423 -12.232060 -12.466230 -12.716390 -13.222126 -13.654461 
##         AS         VX         UA         9E 
## -13.818454 -14.520286 -14.948347 -15.026057

# b). What are the prevalent reasons for delays?
# Answer: Carrier delay, Late Aircraft Delay and NAS Delay are prevalent reasons for delay.
flight1 <- merged.data[!is.na(merged.data$CARRIER_DELAY),] # Eliminate missing values in carrier delays
sum(flight1$CARRIER_DELAY)

## [1] 1564374

sum(flight1$WEATHER_DELAY)

## [1] 233603

sum(flight1$NAS_DELAY)

## [1] 1148931

sum(flight1$SECURITY_DELAY)

## [1] 6304

sum(flight1$LATE_AIRCRAFT_DELAY)

## [1] 1845040

# c). Which airports had the greatest number of weather delays? How about the greatest number of different days with weather delays?
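# Answer:  The single longest weather delay belonged to a flight out of Milwaukee, WI: General Mitchell International (1591 minutes).
#          January 30th had the most flights with a recorded (non-missing) weather-delay field: 5149 flights. This is a count of rows, not minutes, and it includes rows whose WEATHER_DELAY is 0.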
active_flight <- merged.data[!is.na(merged.data$WEATHER_DELAY),] # Eliminate missing values in weather delays
weather_delay_airport <- active_flight[order(active_flight$WEATHER_DELAY,decreasing=TRUE),]
head(weather_delay_airport[5:7])

##        CARRIER ORIGIN_AIRPORT_ID
## 331259      AA             13342
## 422933      MQ             14492
## 109706      DL             11278
## 496084      MQ             15048
## 145045      MQ             11298
## 414980      MQ             14122
##                            CITY_STATE/COUNTRY_NAME OF AIRPORT
## 331259          Milwaukee, WI: General Mitchell International
## 422933       Raleigh/Durham, NC: Raleigh-Durham International
## 109706      Washington, DC: Ronald Reagan Washington National
## 496084       Sioux City, IA: Sioux Gateway/Col. Bud Day Field
## 145045 Dallas/Fort Worth, TX: Dallas/Fort Worth International
## 414980               Pittsburgh, PA: Pittsburgh International

delay_days <- table(weather_delay_airport$DAY_OF_MONTH)
delay_days

## 
##    1    2    3    4    5    6    7    8    9   10   11   12   13   14   15 
## 3654 3927 3758 2197 2099 2618 1751 2308 2260 2245 3006 1874 3671 3204 2204 
##   16   17   18   19   20   21   22   23   24   25   26   27   28   29   30 
## 3575 2384 2159 1002 1934 2575 2148 2565 3546 4975 1861 2763 3472 3164 5149 
##   31 
## 4162

# Days of the month ranked from the most to the fewest tabulated flights;
# the day labels stay attached to the counts.
decreasing_delay_days <- sort(delay_days, decreasing = TRUE)
decreasing_delay_days

## 
##   30   25   31    2    3   13    1   16   24   28   14   29   11   27    6 
## 5149 4975 4162 3927 3758 3671 3654 3575 3546 3472 3204 3164 3006 2763 2618 
##   21   23   17    8    9   10   15    4   18   22    5   20   12   26    7 
## 2575 2565 2384 2308 2260 2245 2204 2197 2159 2148 2099 1934 1874 1861 1751 
##   19 
## 1002

# Complemental computation
# Mean arrival delay per carrier over all flights with a recorded arrival
# delay. TRUE is written out because the shorthand T is an ordinary variable
# that can be reassigned.
mean_delays_by_carrier <- daply(flight, .(CARRIER),
                                function(df) mean(df$ARR_DELAY, na.rm = TRUE))
mean_delays_by_carrier

##         9E         AA         AS         B6         DL         EV 
##  3.3237369  3.0548632 -2.4638241  4.7916232 -1.9670318 11.0208181 
##         F9         FL         HA         MQ         OO         UA 
## 11.5732803 -3.2131513  1.9762383  7.8210834  5.8476693 -0.3109001 
##         US         VX         WN         YV 
##  1.3495896 -7.3915306  0.5217229  5.3267816

length(mean_delays_by_carrier)

## [1] 16

mean_delays_by_carrier[order(mean_delays_by_carrier, decreasing = TRUE)]

##         F9         EV         MQ         OO         YV         B6 
## 11.5732803 11.0208181  7.8210834  5.8476693  5.3267816  4.7916232 
##         9E         AA         HA         US         WN         UA 
##  3.3237369  3.0548632  1.9762383  1.3495896  0.5217229 -0.3109001 
##         DL         AS         FL         VX 
## -1.9670318 -2.4638241 -3.2131513 -7.3915306

# The End

# Independent_Study_Yanxin_Li.R

# Shuchen

# Thu Dec 10 23:57:03 2015