require(hflights)
## Loading required package: hflights
#Arrival Delays is my random variable X
#This variable is skewed to the right
#Median is less than the mean
my.ArrDelay <- hflights$ArrDelay
hist(my.ArrDelay, xaxp=c(-50,300,20))

summary(my.ArrDelay)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
## -70.000  -8.000   0.000   7.094  11.000 978.000    3622
## Min.    1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
## -70.000  -8.000   0.000   7.094  11.000 978.000    3622 
length(my.ArrDelay)
## [1] 227496
# N - took out NA's
# N = 227496 - 3622
227496 - 3622
## [1] 223874
223874
## [1] 223874
#Deparature Delays is my random variable Y
#This variable is also skewed to the right
#Median is less than the mean
my.DepDelay <- hflights$DepDelay
hist(my.DepDelay, xaxp=c(-50,300,20))

summary(my.DepDelay)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
## -33.000  -3.000   0.000   9.445   9.000 981.000    2905
#Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
#-33.000  -3.000   0.000   9.445   9.000 981.000    2905 
length(my.DepDelay)
## [1] 227496
# N = 227496 - 2905
# N - took out NA's
227496 - 2905
## [1] 224591
224591
## [1] 224591
#Probabilty PROBLEMS

#a. P(X>x | Y>y) = 0.4323236
#P(X>x | Y>y) = (P(X>x)intersection P(Y>y))/P(Y>y)
#P(Y>y) = 0.5
nrow(subset(hflights, ArrDelay > 11.000 & DepDelay > 0.000))
## [1] 48548
#48548 = (P(X>x)intersection P(Y>y)) - number of rows that meet the condition
(48548/224591)/0.5
## [1] 0.4323236
0.4323236
## [1] 0.4323236
#b.P(X>x, Y>y) = 0.125
0.25*0.50
## [1] 0.125
#c. P(X<x | Y>y)
#P(X<x | Y>y) = P(X<x)intersection P(Y>y)/P(Y>y)
nrow(subset(hflights, ArrDelay < 11.000 & DepDelay > 0.000))
## [1] 58780
58780
## [1] 58780
(58780/224591)/0.5
## [1] 0.5234404
0.5234404
## [1] 0.5234404
#d. P(X<x | Y<y)
#P(X<x | Y<y) = P(X<x)intersection P(Y<y)/P(Y<y)
nrow(subset(hflights, ArrDelay < 11.000 & DepDelay < 0.000))
## [1] 90992
90992
## [1] 90992
(90992/224591)/0.5
## [1] 0.8102907
0.8102907
## [1] 0.8102907
# x/y                <=2d quartile   >2d quartile    Total
# <=3d quartile          0.482         0.272         0.754  
# >3d quartile           0.027         0.216         0.243   
# Total                  0.509         0.488         1.00

#No. Splitting the data above does not make them more independent. In fact, it shows the possible relationship
#between the 2 variables. It shows the common values for the X and Y variables
nrow(subset(hflights, ArrDelay <= 11.000 & DepDelay <= 0.000))
## [1] 108141
108141/224591
## [1] 0.4815019
0.4815019
## [1] 0.4815019
nrow(subset(hflights, ArrDelay <= 11.000 & DepDelay > 0.000))
## [1] 61026
61026/224591
## [1] 0.2717206
0.2717206
## [1] 0.2717206
nrow(subset(hflights, ArrDelay > 11.000 & DepDelay <= 0.000))
## [1] 6159
6159/224591
## [1] 0.02742318
0.02742318
## [1] 0.02742318
nrow(subset(hflights, ArrDelay > 11.000 & DepDelay > 0.000))
## [1] 48548
48548/224591
## [1] 0.2161618
0.2161618
## [1] 0.2161618
#A - 
nrow(subset(hflights, ArrDelay > 11.000))
## [1] 54707
54707
## [1] 54707
#P(A)
54707
## [1] 54707
0.243585
## [1] 0.243585
#P(B)
nrow(subset(hflights, DepDelay > 0.000))
## [1] 109996
109996
## [1] 109996
109996/224591
## [1] 0.4897614
0.4897614
## [1] 0.4897614
#P(A)* P(B)
0.243585 * 0.4897614
## [1] 0.1192985
0.1192985
## [1] 0.1192985
#P(A|B) = 0.4323236
#P(A)* P(B) not equal to P(A|B)

#create the correlation matrix for the correlation between ArrDelay and DepDelay
tbl <- matrix(c(0.482,0.027,0.272,0.216),ncol=2)
tbl
##       [,1]  [,2]
## [1,] 0.482 0.272
## [2,] 0.027 0.216
chisq.test(tbl, correct=FALSE)
## Warning in chisq.test(tbl, correct = FALSE): Chi-squared approximation may
## be incorrect
## 
##  Pearson's Chi-squared test
## 
## data:  tbl
## X-squared = 0.20514, df = 1, p-value = 0.6506
#data:  tbl
#X-squared = 0.20514, df = 1, p-value = 0.6506
#since p-value of 0.6506 is greater than .05, we can reject the null hypothesis a
#there is a dependence between the x random variable (arrival delay) and the y random variable (departure delay)



# DESCRIPTIVE and INFERENTIAL STATISTICS
#Arrival Delays is my random variable X
#This variable is skewed to the right
#Median is less than the mean
my.ArrDelay <- hflights$ArrDelay
hist(my.ArrDelay, xaxp=c(-50,300,20))

summary(my.ArrDelay)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
## -70.000  -8.000   0.000   7.094  11.000 978.000    3622
## Min.    1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
## -70.000  -8.000   0.000   7.094  11.000 978.000    3622 
var(hflights$ArrDelay, na.rm=TRUE)
## [1] 943.013
943.013
## [1] 943.013
length(my.ArrDelay)
## [1] 227496
# N = 227496 - 3622
# take out the NA's to arrive at the true sample n
227496 - 3622
## [1] 223874
223874
## [1] 223874
#Deparature Delays is my random variable Y
#This variable is also skewed to the right
#Median is less than the mean
my.DepDelay <- 
hist(my.DepDelay, xaxp=c(-50,300,20))

summary(my.DepDelay)
##          Length Class  Mode     
## breaks   22     -none- numeric  
## counts   21     -none- numeric  
## density  21     -none- numeric  
## mids     21     -none- numeric  
## xname     1     -none- character
## equidist  1     -none- logical
#Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
#-33.000  -3.000   0.000   9.445   9.000 981.000    2905 
var(hflights$DepDelay, na.rm = TRUE)
## [1] 829.6482
829.6482
## [1] 829.6482
length(my.DepDelay)
## [1] 6
# N = 227496 - 2905
227496 - 2905
## [1] 224591
224591
## [1] 224591
#scatterplot of variables X (ArrDelay) and Y (DepDelay) 
plot(hflights$ArrDelay, hflights$DepDelay, main="Arrival/Departure Delays Relationship", xlab="Arrival Delay", ylab="Departure Delay", pch=19)

MSE <- (829.6482 + 943.013)/2
MSE
## [1] 886.3306
886.3306
## [1] 886.3306
sm1m2 <- sqrt((2*MSE)/((224591+223874)/2))
sm1m2
## [1] 0.08891266
0.08891266
## [1] 0.08891266
#degrees of freedom
224591+223874-2
## [1] 448463
448463
## [1] 448463
#t = 1.96 for 95% confidence interval

M2M1 <- 9.445 - 7.094
M2M1
## [1] 2.351
2.351
## [1] 2.351
#for 95% confidence interval
lower.limit = 2.351-(1.96)*(0.08891266)
lower.limit
## [1] 2.176731
upper.limit = 2.351-(1.96)*(0.08891266)
upper.limit
## [1] 2.176731
#correlation matrix
my.flights <- na.omit(hflights)
cor(my.flights$ArrDelay, my.flights$DepDelay)
## [1] 0.9292181
0.9292181
## [1] 0.9292181
my.corrmat = matrix(c(1.0,0.93,0.93,1), ncol =2)
my.corrmat
##      [,1] [,2]
## [1,] 1.00 0.93
## [2,] 0.93 1.00
#Z' = 1.653
my.stderr = 1/sqrt(224232.5-3)
my.stderr
## [1] 0.002111804
lower.limit = 1.653 - (2.58)*(0.002)
lower.limit
## [1] 1.64784
1.64784
## [1] 1.64784
upper.limit = 1.653 + (2.58)*(0.002)
upper.limit
## [1] 1.65816
1.65816
## [1] 1.65816
#convert back to r values (99% confidence interval)
#0.929 < p < 0.930

#conclusion - we reject the null hypothesis that arrival and departure delay have 0 correlation.
#In fact, they are very strongly correlated.


#Linear Algebra and Correlation

#Invert the correlation matrix to get the precision matrix
my.presmat <- solve(my.corrmat)
my.presmat
##           [,1]      [,2]
## [1,]  7.401925 -6.883790
## [2,] -6.883790  7.401925
#Multiply the correlation matrix by the precision matrix
my.id <- my.corrmat%*%my.presmat
#Multiply the the precision matrix by the correlation matrix
my.id2 <- my.presmat%*%my.corrmat


#Calculus based Probability and Statistics

my.ArrDelay <- my.flights$ArrDelay
my.ArrDelay <- my.ArrDelay + 70
summary(my.ArrDelay)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.00   62.00   70.00   77.09   81.00 1048.00
library(MASS)
#run to fit an exponential distribution
fitdistr(my.ArrDelay, "exponential")
##        rate    
##   1.297112e-02 
##  (2.741421e-05)
#rate   - optimal value of lambda  
#1.297112e-02 
#(2.741421e-05)
my.ArrDelay.exp <- rexp(1000, 1.297112e-02)

#Histogram of the original X variable (ArrDelay) before fit
hist(my.ArrDelay, xaxp=c(-50,300,20))

#Histogram of the original X variable (ArrDelay) after fit
hist(my.ArrDelay.exp, yaxp=c(0,500,10))

#5th percentile using cdf
qexp(.05,rate=.297112e-02)
## [1] 17.26396
17.26396
## [1] 17.26396
#95th percentile using cdf
qexp(.95,rate=.297112e-02)
## [1] 1008.284
1008.284
## [1] 1008.284
#determine n and standard deviation
length(my.ArrDelay)
## [1] 223874
sd(my.ArrDelay)
## [1] 30.70852
#determine standard error
error <- qt(0.975,df=length(my.ArrDelay)-1)*sd(my.ArrDelay)/sqrt(length(my.ArrDelay))
0.127206
## [1] 0.127206
left <- mean(my.ArrDelay)-error
right <- mean(my.ArrDelay)+error
#95% confidence interval for empirical data
# 76.96713 <= mean <= 77.2215

#empiral 5th and 95th percentile of empirlcal data
quantile(my.ArrDelay, prob=c(0.05,0.95))
##  5% 95% 
##  52 127