require(hflights)
## Loading required package: hflights
#Arrival Delays is my random variable X
#This variable is skewed to the right
#Median is less than the mean
my.ArrDelay <- hflights$ArrDelay
hist(my.ArrDelay, xaxp=c(-50,300,20))

summary(my.ArrDelay)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## -70.000 -8.000 0.000 7.094 11.000 978.000 3622
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## -70.000 -8.000 0.000 7.094 11.000 978.000 3622
length(my.ArrDelay)
## [1] 227496
# N - took out NA's
# N = 227496 - 3622
227496 - 3622
## [1] 223874
223874
## [1] 223874
#Deparature Delays is my random variable Y
#This variable is also skewed to the right
#Median is less than the mean
my.DepDelay <- hflights$DepDelay
hist(my.DepDelay, xaxp=c(-50,300,20))

summary(my.DepDelay)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## -33.000 -3.000 0.000 9.445 9.000 981.000 2905
#Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
#-33.000 -3.000 0.000 9.445 9.000 981.000 2905
length(my.DepDelay)
## [1] 227496
# N = 227496 - 2905
# N - took out NA's
227496 - 2905
## [1] 224591
224591
## [1] 224591
#Probabilty PROBLEMS
#a. P(X>x | Y>y) = 0.4323236
#P(X>x | Y>y) = (P(X>x)intersection P(Y>y))/P(Y>y)
#P(Y>y) = 0.5
nrow(subset(hflights, ArrDelay > 11.000 & DepDelay > 0.000))
## [1] 48548
#48548 = (P(X>x)intersection P(Y>y)) - number of rows that meet the condition
(48548/224591)/0.5
## [1] 0.4323236
0.4323236
## [1] 0.4323236
#b.P(X>x, Y>y) = 0.125
0.25*0.50
## [1] 0.125
#c. P(X<x | Y>y)
#P(X<x | Y>y) = P(X<x)intersection P(Y>y)/P(Y>y)
nrow(subset(hflights, ArrDelay < 11.000 & DepDelay > 0.000))
## [1] 58780
58780
## [1] 58780
(58780/224591)/0.5
## [1] 0.5234404
0.5234404
## [1] 0.5234404
#d. P(X<x | Y<y)
#P(X<x | Y<y) = P(X<x)intersection P(Y<y)/P(Y<y)
nrow(subset(hflights, ArrDelay < 11.000 & DepDelay < 0.000))
## [1] 90992
90992
## [1] 90992
(90992/224591)/0.5
## [1] 0.8102907
0.8102907
## [1] 0.8102907
# x/y <=2d quartile >2d quartile Total
# <=3d quartile 0.482 0.272 0.754
# >3d quartile 0.027 0.216 0.243
# Total 0.509 0.488 1.00
#No. Splitting the data above does not make them more independent. In fact, it shows the possible relationship
#between the 2 variables. It shows the common values for the X and Y variables
nrow(subset(hflights, ArrDelay <= 11.000 & DepDelay <= 0.000))
## [1] 108141
108141/224591
## [1] 0.4815019
0.4815019
## [1] 0.4815019
nrow(subset(hflights, ArrDelay <= 11.000 & DepDelay > 0.000))
## [1] 61026
61026/224591
## [1] 0.2717206
0.2717206
## [1] 0.2717206
nrow(subset(hflights, ArrDelay > 11.000 & DepDelay <= 0.000))
## [1] 6159
6159/224591
## [1] 0.02742318
0.02742318
## [1] 0.02742318
nrow(subset(hflights, ArrDelay > 11.000 & DepDelay > 0.000))
## [1] 48548
48548/224591
## [1] 0.2161618
0.2161618
## [1] 0.2161618
#A -
nrow(subset(hflights, ArrDelay > 11.000))
## [1] 54707
54707
## [1] 54707
#P(A)
54707
## [1] 54707
0.243585
## [1] 0.243585
#P(B)
nrow(subset(hflights, DepDelay > 0.000))
## [1] 109996
109996
## [1] 109996
109996/224591
## [1] 0.4897614
0.4897614
## [1] 0.4897614
#P(A)* P(B)
0.243585 * 0.4897614
## [1] 0.1192985
0.1192985
## [1] 0.1192985
#P(A|B) = 0.4323236
#P(A)* P(B) not equal to P(A|B)
#create the correlation matrix for the correlation between ArrDelay and DepDelay
tbl <- matrix(c(0.482,0.027,0.272,0.216),ncol=2)
tbl
## [,1] [,2]
## [1,] 0.482 0.272
## [2,] 0.027 0.216
chisq.test(tbl, correct=FALSE)
## Warning in chisq.test(tbl, correct = FALSE): Chi-squared approximation may
## be incorrect
##
## Pearson's Chi-squared test
##
## data: tbl
## X-squared = 0.20514, df = 1, p-value = 0.6506
#data: tbl
#X-squared = 0.20514, df = 1, p-value = 0.6506
#since p-value of 0.6506 is greater than .05, we can reject the null hypothesis a
#there is a dependence between the x random variable (arrival delay) and the y random variable (departure delay)
# DESCRIPTIVE and INFERENTIAL STATISTICS
#Arrival Delays is my random variable X
#This variable is skewed to the right
#Median is less than the mean
my.ArrDelay <- hflights$ArrDelay
hist(my.ArrDelay, xaxp=c(-50,300,20))

summary(my.ArrDelay)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## -70.000 -8.000 0.000 7.094 11.000 978.000 3622
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## -70.000 -8.000 0.000 7.094 11.000 978.000 3622
var(hflights$ArrDelay, na.rm=TRUE)
## [1] 943.013
943.013
## [1] 943.013
length(my.ArrDelay)
## [1] 227496
# N = 227496 - 3622
# take out the NA's to arrive at the true sample n
227496 - 3622
## [1] 223874
223874
## [1] 223874
#Deparature Delays is my random variable Y
#This variable is also skewed to the right
#Median is less than the mean
my.DepDelay <-
hist(my.DepDelay, xaxp=c(-50,300,20))

summary(my.DepDelay)
## Length Class Mode
## breaks 22 -none- numeric
## counts 21 -none- numeric
## density 21 -none- numeric
## mids 21 -none- numeric
## xname 1 -none- character
## equidist 1 -none- logical
#Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
#-33.000 -3.000 0.000 9.445 9.000 981.000 2905
var(hflights$DepDelay, na.rm = TRUE)
## [1] 829.6482
829.6482
## [1] 829.6482
length(my.DepDelay)
## [1] 6
# N = 227496 - 2905
227496 - 2905
## [1] 224591
224591
## [1] 224591
#scatterplot of variables X (ArrDelay) and Y (DepDelay)
plot(hflights$ArrDelay, hflights$DepDelay, main="Arrival/Departure Delays Relationship", xlab="Arrival Delay", ylab="Departure Delay", pch=19)

MSE <- (829.6482 + 943.013)/2
MSE
## [1] 886.3306
886.3306
## [1] 886.3306
sm1m2 <- sqrt((2*MSE)/((224591+223874)/2))
sm1m2
## [1] 0.08891266
0.08891266
## [1] 0.08891266
#degrees of freedom
224591+223874-2
## [1] 448463
448463
## [1] 448463
#t = 1.96 for 95% confidence interval
M2M1 <- 9.445 - 7.094
M2M1
## [1] 2.351
2.351
## [1] 2.351
#for 95% confidence interval
lower.limit = 2.351-(1.96)*(0.08891266)
lower.limit
## [1] 2.176731
upper.limit = 2.351-(1.96)*(0.08891266)
upper.limit
## [1] 2.176731
#correlation matrix
my.flights <- na.omit(hflights)
cor(my.flights$ArrDelay, my.flights$DepDelay)
## [1] 0.9292181
0.9292181
## [1] 0.9292181
my.corrmat = matrix(c(1.0,0.93,0.93,1), ncol =2)
my.corrmat
## [,1] [,2]
## [1,] 1.00 0.93
## [2,] 0.93 1.00
#Z' = 1.653
my.stderr = 1/sqrt(224232.5-3)
my.stderr
## [1] 0.002111804
lower.limit = 1.653 - (2.58)*(0.002)
lower.limit
## [1] 1.64784
1.64784
## [1] 1.64784
upper.limit = 1.653 + (2.58)*(0.002)
upper.limit
## [1] 1.65816
1.65816
## [1] 1.65816
#convert back to r values (99% confidence interval)
#0.929 < p < 0.930
#conclusion - we reject the null hypothesis that arrival and departure delay have 0 correlation.
#In fact, they are very strongly correlated.
#Linear Algebra and Correlation
#Invert the correlation matrix to get the precision matrix
my.presmat <- solve(my.corrmat)
my.presmat
## [,1] [,2]
## [1,] 7.401925 -6.883790
## [2,] -6.883790 7.401925
#Multiply the correlation matrix by the precision matrix
my.id <- my.corrmat%*%my.presmat
#Multiply the the precision matrix by the correlation matrix
my.id2 <- my.presmat%*%my.corrmat
#Calculus based Probability and Statistics
my.ArrDelay <- my.flights$ArrDelay
my.ArrDelay <- my.ArrDelay + 70
summary(my.ArrDelay)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 62.00 70.00 77.09 81.00 1048.00
library(MASS)
#run to fit an exponential distribution
fitdistr(my.ArrDelay, "exponential")
## rate
## 1.297112e-02
## (2.741421e-05)
#rate - optimal value of lambda
#1.297112e-02
#(2.741421e-05)
my.ArrDelay.exp <- rexp(1000, 1.297112e-02)
#Histogram of the original X variable (ArrDelay) before fit
hist(my.ArrDelay, xaxp=c(-50,300,20))

#Histogram of the original X variable (ArrDelay) after fit
hist(my.ArrDelay.exp, yaxp=c(0,500,10))

#5th percentile using cdf
qexp(.05,rate=.297112e-02)
## [1] 17.26396
17.26396
## [1] 17.26396
#95th percentile using cdf
qexp(.95,rate=.297112e-02)
## [1] 1008.284
1008.284
## [1] 1008.284
#determine n and standard deviation
length(my.ArrDelay)
## [1] 223874
sd(my.ArrDelay)
## [1] 30.70852
#determine standard error
error <- qt(0.975,df=length(my.ArrDelay)-1)*sd(my.ArrDelay)/sqrt(length(my.ArrDelay))
0.127206
## [1] 0.127206
left <- mean(my.ArrDelay)-error
right <- mean(my.ArrDelay)+error
#95% confidence interval for empirical data
# 76.96713 <= mean <= 77.2215
#empiral 5th and 95th percentile of empirlcal data
quantile(my.ArrDelay, prob=c(0.05,0.95))
## 5% 95%
## 52 127