Final Project Data Science Math

Project Description

Analyze two random variables, the arrival and departure delay, from the hflights package for correlation.

Software Preparation

Install package hflights

# install.packages('hflights')

Load libraries

library(hflights)
library(ggplot2)
library(MASS)
library(survival)
library(fitdistrplus)

Load Data To The Envirement

flight_data<-hflights
str(flight_data)

## 'data.frame':    227496 obs. of  21 variables:
##  $ Year             : int  2011 2011 2011 2011 2011 2011 2011 2011 2011 2011 ...
##  $ Month            : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ DayofMonth       : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ DayOfWeek        : int  6 7 1 2 3 4 5 6 7 1 ...
##  $ DepTime          : int  1400 1401 1352 1403 1405 1359 1359 1355 1443 1443 ...
##  $ ArrTime          : int  1500 1501 1502 1513 1507 1503 1509 1454 1554 1553 ...
##  $ UniqueCarrier    : chr  "AA" "AA" "AA" "AA" ...
##  $ FlightNum        : int  428 428 428 428 428 428 428 428 428 428 ...
##  $ TailNum          : chr  "N576AA" "N557AA" "N541AA" "N403AA" ...
##  $ ActualElapsedTime: int  60 60 70 70 62 64 70 59 71 70 ...
##  $ AirTime          : int  40 45 48 39 44 45 43 40 41 45 ...
##  $ ArrDelay         : int  -10 -9 -8 3 -3 -7 -1 -16 44 43 ...
##  $ DepDelay         : int  0 1 -8 3 5 -1 -1 -5 43 43 ...
##  $ Origin           : chr  "IAH" "IAH" "IAH" "IAH" ...
##  $ Dest             : chr  "DFW" "DFW" "DFW" "DFW" ...
##  $ Distance         : int  224 224 224 224 224 224 224 224 224 224 ...
##  $ TaxiIn           : int  7 6 5 9 9 6 12 7 8 6 ...
##  $ TaxiOut          : int  13 9 17 22 9 13 15 12 22 19 ...
##  $ Cancelled        : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ CancellationCode : chr  "" "" "" "" ...
##  $ Diverted         : int  0 0 0 0 0 0 0 0 0 0 ...

Get the arrival and departure delays data into variables with NA omitted and print a summary

ArrDelay_y<-na.omit(flight_data$ArrDelay)
summary(ArrDelay_y)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## -70.000  -8.000   0.000   7.094  11.000 978.000

DepDelay_X<-na.omit(flight_data$DepDelay)
summary(DepDelay_X)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## -33.000  -3.000   0.000   9.445   9.000 981.000

Plot The Variables

hist(ArrDelay_y, main = "Arrival Delay", xlab = "ArrDelay ", ylab = "Frequency",
                                         xlim = c(0, 1000), breaks = 20, border = "blue")

hist(DepDelay_X, main = "Departure Delay", xlab = "Departure ", ylab = "Frequency",
                                         xlim = c(0, 1000), breaks = 20, border = "orange")

Probability analysis

Find the probabilities for a given table

#get the count of Py>3d & px>0
CountArrDelay_HH<-nrow(subset(hflights, ArrDelay 
                              > quantile(ArrDelay_y,0.75) & DepDelay > 0.000))
CountArrDelay_HH

## [1] 48548

#P(X>x | Y>y) 
phh<-(CountArrDelay_HH/length(DepDelay_X))/0.5
phh

## [1] 0.4323236

#P(X>x,Y>y)
pm=0.25*0.5
pm

## [1] 0.125

#get the count of Py<3d & px>0
CountArrDelay_LH<-nrow(subset(hflights, ArrDelay
                              < quantile(ArrDelay_y,0.75) & DepDelay > 0.000))
CountArrDelay_LH

## [1] 58780

#P(X<x,Y>y)
plh=(CountArrDelay_LH/length(DepDelay_X))/0.5
plh

## [1] 0.5234404

#get the count of Py>3d & px<0
CountArrDelay_LL<-nrow(subset(hflights, ArrDelay 
                              < quantile(ArrDelay_y,0.75) & DepDelay < 0.000))
CountArrDelay_LL

## [1] 90992

#P(X<x,Y<y)
pll=(CountArrDelay_LL/length(DepDelay_X))/0.5
pll

## [1] 0.8102907

#get the count of Py<=3d & px<=0
CountArrDelay_LLe<-nrow(subset(hflights, ArrDelay 
                               <= quantile(ArrDelay_y,0.75) & DepDelay <= 0.000))
CountArrDelay_LLe

## [1] 108141

#P(X<=x,Y<=y)
plle=(CountArrDelay_LLe/length(DepDelay_X))
plle

## [1] 0.4815019

#get the count of Py<=3d & px>0
CountArrDelay_LH_e<-nrow(subset(hflights, ArrDelay
                                <= quantile(ArrDelay_y,0.75) & DepDelay > 0.000))
CountArrDelay_LH_e

## [1] 61026

#P(X<=x,Y>y)
plh_e=(CountArrDelay_LH_e/length(DepDelay_X))
plh_e

## [1] 0.2717206

#get the count of Py>3d & px<=0
CountArrDelay_HL_e<-nrow(subset(hflights, ArrDelay
                                > quantile(ArrDelay_y,0.75) & DepDelay <= 0.000))
CountArrDelay_HL_e

## [1] 6159

#P(X>x,Y<=y)
phl_e=(CountArrDelay_HL_e/length(DepDelay_X))
phl_e

## [1] 0.02742318

#P(X>x,Y>y)
ph_h=(CountArrDelay_HH/length(DepDelay_X))
ph_h

## [1] 0.2161618

resultTable = matrix(c(plle,plh_e,(plle+plh_e),phl_e,ph_h,(phl_e+ph_h),
                       (plle+phl_e),(plh_e+ph_h),((plle+plh_e)+(phl_e+ph_h)) ),
                     ncol=3, nrow=3,byrow=TRUE)
colnames(resultTable) = c("<=2d quartile",">2d quartile","Total")
rownames(resultTable) = c('<=3d quartile', '>3d quartile','Total')
resultTable.table = as.table(resultTable)
round(resultTable.table,3)

##               <=2d quartile >2d quartile Total
## <=3d quartile         0.482        0.272 0.753
## >3d quartile          0.027        0.216 0.244
## Total                 0.509        0.488 0.997

#p(A)
CountArrDelay<-nrow(subset(hflights, ArrDelay > quantile(ArrDelay_y,0.75)))
CountArrDelay

## [1] 54707

pa=CountArrDelay/length(DepDelay_X)
pa

## [1] 0.243585

length(CountArrDelay)

## [1] 1

#P(B)

CountDepDelay<-nrow(subset(hflights, DepDelay >0.00 &DepDelay <9.00))
CountDepDelay

## [1] 51153

CountDepDelay<-nrow(subset(hflights, DepDelay >0.00))
CountDepDelay

## [1] 109996

pb=CountDepDelay/length(DepDelay_X)
pb

## [1] 0.4897614

#P(A)*P(B)
pab=pa*pb
pab

## [1] 0.1192985

As results P(A|B) != P(A)P(B)

before Chisquare we need to clean the vectores with equal length

get_na_omittedData=na.omit(flight_data)

Chisquare test

chisq.test(get_na_omittedData$ArrDelay,get_na_omittedData$DepDelay,correct = FALSE)

## Warning in chisq.test(get_na_omittedData$ArrDelay, get_na_omittedData
## $DepDelay, : Chi-squared approximation may be incorrect

## 
##  Pearson's Chi-squared test
## 
## data:  get_na_omittedData$ArrDelay and get_na_omittedData$DepDelay
## X-squared = 24824000, df = 197270, p-value < 2.2e-16

P < 0.05 rejects the null hypothesis that is no relashinship between arrival and departure

lets take the t test and compare it to Chisquare result

t.test(flight_data$DepDelay,flight_data$ArrDelay)

## 
##  Welch Two Sample t-test
## 
## data:  flight_data$DepDelay and flight_data$ArrDelay
## t = 26.436, df = 446450, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  2.176342 2.524892
## sample estimates:
## mean of x mean of y 
##  9.444951  7.094334

Descriptive and Inferential Statistics

Provide univariate descriptive statistics and appropriate plots.

cor_matrix=matrix(c(1.0,0.93,0.93,1), ncol =2)

#converting the matrix 

m<-solve(cor_matrix)
m

##           [,1]      [,2]
## [1,]  7.401925 -6.883790
## [2,] -6.883790  7.401925

mi1<-cor_matrix%*%m
mi1

##      [,1] [,2]
## [1,]    1    0
## [2,]    0    1

mi2<-m%*%cor_matrix
mi2

##      [,1] [,2]
## [1,]    1    0
## [2,]    0    1

cor.test(get_na_omittedData$ArrDelay,get_na_omittedData$DepDelay,use="complete.obs")

## 
##  Pearson's product-moment correlation
## 
## data:  get_na_omittedData$ArrDelay and get_na_omittedData$DepDelay
## t = 1189.8, df = 223870, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.9286503 0.9297816
## sample estimates:
##       cor 
## 0.9292181

The result of the correlations shows a strong relationship with percentile greater tha 92%

Scatter Plot

Plot the departure delay verses the arrival delay to observe the relationship

y=get_na_omittedData$ArrDelay
x=get_na_omittedData$DepDelay

ggplot(get_na_omittedData, aes(x,y))+
    geom_point(shape=1) + 
    geom_jitter(aes(colour = y))+
    labs(title = "Arrival Delay vs Departure Delay")+
    xlab("Departure Delay") +   
    ylab("Arrival Delay") +
    geom_smooth(method=lm)

From the graph and the linear correlation we can conclude that their is a relashinship between the two variables

Calculus-Based Probability & Statistics

y=y+70
y.exp <- rexp(1000, 1.297112e-02)
#Histogram of the original X variable (ArrDelay) before fit
hist(y.exp, main = "Arrival Delay", xlab = "ArrDelay ", ylab = "Frequency",
                                        xaxp=c(-50,600,20), border = "green")

# get the 5th Percentile
qexp(.05,rate=.297112e-02)

## [1] 17.26396

#get the 95%
qexp(.95,rate=.297112e-02)

## [1] 1008.284

sd(ArrDelay_y)

## [1] 30.70852

e<-qt(0.975,df=length(ArrDelay_y)-1)*sd(ArrDelay_y)/sqrt(length(ArrDelay_y))

l <- mean(ArrDelay_y)-e
r <- mean(ArrDelay_y)+e

quantile(ArrDelay_y, prob=c(0.05,0.95))

##  5% 95% 
## -18  57

Conclusion

The correlation of 93% and the small P value shows a strong relationship between the two variables. The analysis indicates that departure delay is related to the cause of arrival delay. Looking at the graph, from the condensed congestion of the jitter points we can also determine that there is strong relationship.