Analyze two random variables, the arrival and departure delay, from the hflights package for correlation.
Install package hflights
# install.packages('hflights')Load libraries
library(hflights)
library(ggplot2)
library(MASS)
library(survival)
library(fitdistrplus)flight_data<-hflights
str(flight_data)## 'data.frame': 227496 obs. of 21 variables:
## $ Year : int 2011 2011 2011 2011 2011 2011 2011 2011 2011 2011 ...
## $ Month : int 1 1 1 1 1 1 1 1 1 1 ...
## $ DayofMonth : int 1 2 3 4 5 6 7 8 9 10 ...
## $ DayOfWeek : int 6 7 1 2 3 4 5 6 7 1 ...
## $ DepTime : int 1400 1401 1352 1403 1405 1359 1359 1355 1443 1443 ...
## $ ArrTime : int 1500 1501 1502 1513 1507 1503 1509 1454 1554 1553 ...
## $ UniqueCarrier : chr "AA" "AA" "AA" "AA" ...
## $ FlightNum : int 428 428 428 428 428 428 428 428 428 428 ...
## $ TailNum : chr "N576AA" "N557AA" "N541AA" "N403AA" ...
## $ ActualElapsedTime: int 60 60 70 70 62 64 70 59 71 70 ...
## $ AirTime : int 40 45 48 39 44 45 43 40 41 45 ...
## $ ArrDelay : int -10 -9 -8 3 -3 -7 -1 -16 44 43 ...
## $ DepDelay : int 0 1 -8 3 5 -1 -1 -5 43 43 ...
## $ Origin : chr "IAH" "IAH" "IAH" "IAH" ...
## $ Dest : chr "DFW" "DFW" "DFW" "DFW" ...
## $ Distance : int 224 224 224 224 224 224 224 224 224 224 ...
## $ TaxiIn : int 7 6 5 9 9 6 12 7 8 6 ...
## $ TaxiOut : int 13 9 17 22 9 13 15 12 22 19 ...
## $ Cancelled : int 0 0 0 0 0 0 0 0 0 0 ...
## $ CancellationCode : chr "" "" "" "" ...
## $ Diverted : int 0 0 0 0 0 0 0 0 0 0 ...
Get the arrival and departure delays data into variables with NA omitted and print a summary
ArrDelay_y<-na.omit(flight_data$ArrDelay)
summary(ArrDelay_y)## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -70.000 -8.000 0.000 7.094 11.000 978.000
DepDelay_X<-na.omit(flight_data$DepDelay)
summary(DepDelay_X)## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -33.000 -3.000 0.000 9.445 9.000 981.000
hist(ArrDelay_y, main = "Arrival Delay", xlab = "ArrDelay ", ylab = "Frequency",
xlim = c(0, 1000), breaks = 20, border = "blue")hist(DepDelay_X, main = "Departure Delay", xlab = "Departure ", ylab = "Frequency",
xlim = c(0, 1000), breaks = 20, border = "orange")Find the probabilities for a given table
#get the count of Py>3d & px>0
CountArrDelay_HH<-nrow(subset(hflights, ArrDelay
> quantile(ArrDelay_y,0.75) & DepDelay > 0.000))
CountArrDelay_HH## [1] 48548
#P(X>x | Y>y)
phh<-(CountArrDelay_HH/length(DepDelay_X))/0.5
phh## [1] 0.4323236
#P(X>x,Y>y)
pm=0.25*0.5
pm ## [1] 0.125
#get the count of Py<3d & px>0
CountArrDelay_LH<-nrow(subset(hflights, ArrDelay
< quantile(ArrDelay_y,0.75) & DepDelay > 0.000))
CountArrDelay_LH## [1] 58780
#P(X<x,Y>y)
plh=(CountArrDelay_LH/length(DepDelay_X))/0.5
plh ## [1] 0.5234404
#get the count of Py>3d & px<0
CountArrDelay_LL<-nrow(subset(hflights, ArrDelay
< quantile(ArrDelay_y,0.75) & DepDelay < 0.000))
CountArrDelay_LL## [1] 90992
#P(X<x,Y<y)
pll=(CountArrDelay_LL/length(DepDelay_X))/0.5
pll ## [1] 0.8102907
#get the count of Py<=3d & px<=0
CountArrDelay_LLe<-nrow(subset(hflights, ArrDelay
<= quantile(ArrDelay_y,0.75) & DepDelay <= 0.000))
CountArrDelay_LLe## [1] 108141
#P(X<=x,Y<=y)
plle=(CountArrDelay_LLe/length(DepDelay_X))
plle ## [1] 0.4815019
#get the count of Py<=3d & px>0
CountArrDelay_LH_e<-nrow(subset(hflights, ArrDelay
<= quantile(ArrDelay_y,0.75) & DepDelay > 0.000))
CountArrDelay_LH_e## [1] 61026
#P(X<=x,Y>y)
plh_e=(CountArrDelay_LH_e/length(DepDelay_X))
plh_e ## [1] 0.2717206
#get the count of Py>3d & px<=0
CountArrDelay_HL_e<-nrow(subset(hflights, ArrDelay
> quantile(ArrDelay_y,0.75) & DepDelay <= 0.000))
CountArrDelay_HL_e## [1] 6159
#P(X>x,Y<=y)
phl_e=(CountArrDelay_HL_e/length(DepDelay_X))
phl_e ## [1] 0.02742318
#P(X>x,Y>y)
ph_h=(CountArrDelay_HH/length(DepDelay_X))
ph_h ## [1] 0.2161618
resultTable = matrix(c(plle,plh_e,(plle+plh_e),phl_e,ph_h,(phl_e+ph_h),
(plle+phl_e),(plh_e+ph_h),((plle+plh_e)+(phl_e+ph_h)) ),
ncol=3, nrow=3,byrow=TRUE)
colnames(resultTable) = c("<=2d quartile",">2d quartile","Total")
rownames(resultTable) = c('<=3d quartile', '>3d quartile','Total')
resultTable.table = as.table(resultTable)
round(resultTable.table,3)## <=2d quartile >2d quartile Total
## <=3d quartile 0.482 0.272 0.753
## >3d quartile 0.027 0.216 0.244
## Total 0.509 0.488 0.997
#p(A)
CountArrDelay<-nrow(subset(hflights, ArrDelay > quantile(ArrDelay_y,0.75)))
CountArrDelay## [1] 54707
pa=CountArrDelay/length(DepDelay_X)
pa## [1] 0.243585
length(CountArrDelay)## [1] 1
#P(B)
CountDepDelay<-nrow(subset(hflights, DepDelay >0.00 &DepDelay <9.00))
CountDepDelay## [1] 51153
CountDepDelay<-nrow(subset(hflights, DepDelay >0.00))
CountDepDelay## [1] 109996
pb=CountDepDelay/length(DepDelay_X)
pb## [1] 0.4897614
#P(A)*P(B)
pab=pa*pb
pab## [1] 0.1192985
As results P(A|B) != P(A)P(B)
before Chisquare we need to clean the vectores with equal length
get_na_omittedData=na.omit(flight_data)Chisquare test
chisq.test(get_na_omittedData$ArrDelay,get_na_omittedData$DepDelay,correct = FALSE)## Warning in chisq.test(get_na_omittedData$ArrDelay, get_na_omittedData
## $DepDelay, : Chi-squared approximation may be incorrect
##
## Pearson's Chi-squared test
##
## data: get_na_omittedData$ArrDelay and get_na_omittedData$DepDelay
## X-squared = 24824000, df = 197270, p-value < 2.2e-16
P < 0.05 rejects the null hypothesis that is no relashinship between arrival and departure
lets take the t test and compare it to Chisquare result
t.test(flight_data$DepDelay,flight_data$ArrDelay)##
## Welch Two Sample t-test
##
## data: flight_data$DepDelay and flight_data$ArrDelay
## t = 26.436, df = 446450, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 2.176342 2.524892
## sample estimates:
## mean of x mean of y
## 9.444951 7.094334
Provide univariate descriptive statistics and appropriate plots.
cor_matrix=matrix(c(1.0,0.93,0.93,1), ncol =2)
#converting the matrix
m<-solve(cor_matrix)
m## [,1] [,2]
## [1,] 7.401925 -6.883790
## [2,] -6.883790 7.401925
mi1<-cor_matrix%*%m
mi1## [,1] [,2]
## [1,] 1 0
## [2,] 0 1
mi2<-m%*%cor_matrix
mi2## [,1] [,2]
## [1,] 1 0
## [2,] 0 1
cor.test(get_na_omittedData$ArrDelay,get_na_omittedData$DepDelay,use="complete.obs")##
## Pearson's product-moment correlation
##
## data: get_na_omittedData$ArrDelay and get_na_omittedData$DepDelay
## t = 1189.8, df = 223870, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.9286503 0.9297816
## sample estimates:
## cor
## 0.9292181
The result of the correlations shows a strong relationship with percentile greater tha 92%
Plot the departure delay verses the arrival delay to observe the relationship
y=get_na_omittedData$ArrDelay
x=get_na_omittedData$DepDelay
ggplot(get_na_omittedData, aes(x,y))+
geom_point(shape=1) +
geom_jitter(aes(colour = y))+
labs(title = "Arrival Delay vs Departure Delay")+
xlab("Departure Delay") +
ylab("Arrival Delay") +
geom_smooth(method=lm) From the graph and the linear correlation we can conclude that their is a relashinship between the two variables
y=y+70
y.exp <- rexp(1000, 1.297112e-02)
#Histogram of the original X variable (ArrDelay) before fit
hist(y.exp, main = "Arrival Delay", xlab = "ArrDelay ", ylab = "Frequency",
xaxp=c(-50,600,20), border = "green")# get the 5th Percentile
qexp(.05,rate=.297112e-02)## [1] 17.26396
#get the 95%
qexp(.95,rate=.297112e-02)## [1] 1008.284
sd(ArrDelay_y)## [1] 30.70852
e<-qt(0.975,df=length(ArrDelay_y)-1)*sd(ArrDelay_y)/sqrt(length(ArrDelay_y))
l <- mean(ArrDelay_y)-e
r <- mean(ArrDelay_y)+e
quantile(ArrDelay_y, prob=c(0.05,0.95))## 5% 95%
## -18 57
The correlation of 93% and the small P value shows a strong relationship between the two variables. The analysis indicates that departure delay is related to the cause of arrival delay. Looking at the graph, from the condensed congestion of the jitter points we can also determine that there is strong relationship.