#install.packages("hflights")

#loading requird packages
require (hflights)
## Loading required package: hflights
require (ggplot2)
## Loading required package: ggplot2
head (hflights)
##      Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier
## 5424 2011     1          1         6    1400    1500            AA
## 5425 2011     1          2         7    1401    1501            AA
## 5426 2011     1          3         1    1352    1502            AA
## 5427 2011     1          4         2    1403    1513            AA
## 5428 2011     1          5         3    1405    1507            AA
## 5429 2011     1          6         4    1359    1503            AA
##      FlightNum TailNum ActualElapsedTime AirTime ArrDelay DepDelay Origin
## 5424       428  N576AA                60      40      -10        0    IAH
## 5425       428  N557AA                60      45       -9        1    IAH
## 5426       428  N541AA                70      48       -8       -8    IAH
## 5427       428  N403AA                70      39        3        3    IAH
## 5428       428  N492AA                62      44       -3        5    IAH
## 5429       428  N262AA                64      45       -7       -1    IAH
##      Dest Distance TaxiIn TaxiOut Cancelled CancellationCode Diverted
## 5424  DFW      224      7      13         0                         0
## 5425  DFW      224      6       9         0                         0
## 5426  DFW      224      5      17         0                         0
## 5427  DFW      224      9      22         0                         0
## 5428  DFW      224      9       9         0                         0
## 5429  DFW      224      6      13         0                         0

Define Variables

#converting all the na values to zero for consistancy
hflights$ArrDelay[ is.na( hflights$ArrDelay ) ] <- 0
hflights$DepDelay[ is.na( hflights$DepDelay ) ] <- 0

X <- hflights$ArrDelay
Y <- hflights$DepDelay

Generate summary level descriptive statistics

summary (X)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## -70.000  -8.000   0.000   6.981  11.000 978.000
summary (Y)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## -33.000  -3.000   0.000   9.324   9.000 981.000
Xtotal <- length(X) - 3622
Xtotal
## [1] 223874
Ytotal <- length(Y) - 2905
Ytotal
## [1] 224591

Density plot for X and Y

ggplot(hflights, aes(X)) +
  geom_density()

ggplot(hflights, aes(Y)) +
  geom_density()

‘x’ as 3d quartile of ‘X’ and ‘y’ as 2d quartile ‘Y’

x <- quantile(X, .75)
x
## 75% 
##  11
y <- quantile(Y, .5)
y
## 50% 
##   0

Table of counts

a <- nrow(subset(hflights, X <= x & Y <= y));
a;
## [1] 111341
b <- nrow(subset(hflights, X <= x & Y > y));
b;
## [1] 61448
c <- nrow(subset(hflights, X > x & Y <= y));
c;
## [1] 6159
d <- nrow(subset(hflights, X > x & Y > y));
d;
## [1] 48548
values <- matrix(c(a,b,(a+b),c,d,(c+d),(a+c),(b+d),(a+b+c+d)),ncol=3, nrow=3,byrow=TRUE)
colnames(values) <- c("<=2d quartile",">2d quartile","Total")
rownames(values) <- c('<=3d quartile', '>3d quartile','Total')
values.table <- as.table(values)
values.table
##               <=2d quartile >2d quartile  Total
## <=3d quartile        111341        61448 172789
## >3d quartile           6159        48548  54707
## Total                117500       109996 227496

a. P(X>x | Y>y) b. P(X>x, Y>y) c. P(Xy)

# a. P(X>x | Y>y)
48548/109996
## [1] 0.4413615
# b.  P(X>x, Y>y)
48548/227496
## [1] 0.2134016
#c.  P(X<x | Y>y)
(nrow(subset(hflights, X < x & Y > y)))/(nrow(subset(hflights, Y > y)))
## [1] 0.5382196

Does P(A | B) = P(A) * P(B)?

#P(A | B) = P(X>x | Y>y)
48548/109996
## [1] 0.4413615
#P(A)
A <- nrow(subset(hflights, X > x))
A
## [1] 54707
pofA <- A/nrow(hflights)
pofA
## [1] 0.2404746
#P(B)
B <- nrow(subset(hflights, Y > y))
B
## [1] 109996
#P(A | B) is not equal to P(A) * P(B)
pofB <- B/nrow(hflights)
pofB
## [1] 0.4835074
#P(A) * P(B)
p <- pofA * pofB
p
## [1] 0.1162712
# P(A | B) = .4413 in not equal to P(A) * P(B) = .1163(X and Y variable are not independent)
#Chi Squere Test for association

chisq.test(X,Y)
## Warning in chisq.test(X, Y): Chi-squared approximation may be incorrect
## 
##  Pearson's Chi-squared test
## 
## data:  X and Y
## X-squared = 25198000, df = 197740, p-value < 2.2e-16
#We can see form the Chi Square test the p value is smaller than significance level of 0.05, we should reject the hypothesis that X (Arrival delay) and Y Departure dealy are independent

scatterplot of the two variables

qplot(X, Y, main = 'Relationship between Arrival dealay and Departure dalay',
      xlab = 'Arrival Delay', ylab = 'Departure Delay')

#Scatter plot indicates strong positive relationship between the two variable

95% confidence interval for the difference in the mean

t.test(X,Y)
## 
##  Welch Two Sample t-test
## 
## data:  X and Y
## t = -26.722, df = 453240, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -2.514811 -2.171108
## sample estimates:
## mean of x mean of y 
##  6.981384  9.324344
#95% confidence interval for the difference in mean -2.514811 to -2.171108. Values are negative because the mean of random variable X(6.981384) is smaller then mean of random variable Y(9.324344)

correlation matrix for two of the quantitative variables

data <- cbind(X,Y)
matrix <- cor(data, use = "complete")
matrix
##           X         Y
## X 1.0000000 0.9254458
## Y 0.9254458 1.0000000

Hypothesis test to see if correlation between these variables is 0 and provide a 99% confidence interval

cor.test(X,Y,conf.level=0.99)
## 
##  Pearson's product-moment correlation
## 
## data:  X and Y
## t = 1165, df = 227490, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 99 percent confidence interval:
##  0.9246667 0.9262172
## sample estimates:
##       cor 
## 0.9254458
# 99% confidence interval is 0.924667 to 0.9262172. Correlation between X and Y is not zero, therefore the variables are not independent. Correlation coeffieient of 0.9254458 points to strong dependance.

Invert the Correlation Matrix to Create the ‘Precision Matrix’

pmatrix <- solve(matrix)
pmatrix
##           X         Y
## X  6.966212 -6.446851
## Y -6.446851  6.966212

Multiply the Correlation Matrix by the Precision Matrix

matrix%*%pmatrix
##              X Y
## X 1.000000e+00 0
## Y 8.881784e-16 1

Multiply the Precision Matrix by the Correlation Matrix

pmatrix%*%matrix
##   X            Y
## X 1 8.881784e-16
## Y 0 1.000000e+00
#The results of both multiplication are inverse of each other

shift X so that the minimum value is above zero

Xshift  <- X+ abs(min(X)) + 1
summary (Xshift)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1.00   63.00   71.00   77.98   82.00 1049.00

Load the MASS package

require(MASS)
## Loading required package: MASS
library(MASS)

run fitdistr to fit an exponential probability density function

Xexpo <- fitdistr(Xshift, "exponential")
Xexpo
##        rate    
##   1.282357e-02 
##  (2.688575e-05)

Optimal value of lambda for the distribution

1 / Xexpo$estimate
##     rate 
## 77.98138

1000 samples from the exponential distribution

sampex <- rexp(1000, Xexpo$estimate)
summary(sampex)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##   0.1232  22.0300  54.0300  77.4200 107.4000 504.6000

Comparison of histogram of expotential sample with a histogram of original variable

qplot(sampex, geom="histogram", binwidth = 5, fill=I("blue"), col=I("black"))

qplot(X, geom="histogram", binwidth = 5, fill=I("blue"), col=I("black"))

#The expotential sample appears to be less right skwed then the original

5th Percentile

qexp(.05, rate = Xexpo$estimate, lower.tail = TRUE, log.p = FALSE)
## [1] 3.999922

95th Percentile

qexp(.95, rate = Xexpo$estimate, lower.tail = TRUE, log.p = FALSE)
## [1] 233.6113

95% confidence interval from the empirical data, assuming normality

meanX <- mean(na.omit(X)) # get the mean for X
meanX
## [1] 6.981384
sdX <- sqrt(var(na.omit(X)))
sdX
## [1] 30.47602
qnorm(.025, mean = meanX, sd = sdX)
## [1] -52.75051
qnorm(.975, mean = meanX, sd = sdX)
## [1] 66.71328
#If X is normally distributed 95% of the sample mean will be between -52.75051 and 66.71328

The empirical 5th percentile and 95th percentile of the data

#.05 quantile 
quantile(X, .05, na.rm = TRUE)
##  5% 
## -18
#.95 quantile
quantile(X, .95, na.rm = TRUE)
## 95% 
##  56
#GThe empirical 5th percentile and 95th percentil of the implies 90% of the sample X from original data falls between -18 and 56