#install.packages("hflights")
#loading requird packages
require (hflights)
## Loading required package: hflights
require (ggplot2)
## Loading required package: ggplot2
head (hflights)
## Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier
## 5424 2011 1 1 6 1400 1500 AA
## 5425 2011 1 2 7 1401 1501 AA
## 5426 2011 1 3 1 1352 1502 AA
## 5427 2011 1 4 2 1403 1513 AA
## 5428 2011 1 5 3 1405 1507 AA
## 5429 2011 1 6 4 1359 1503 AA
## FlightNum TailNum ActualElapsedTime AirTime ArrDelay DepDelay Origin
## 5424 428 N576AA 60 40 -10 0 IAH
## 5425 428 N557AA 60 45 -9 1 IAH
## 5426 428 N541AA 70 48 -8 -8 IAH
## 5427 428 N403AA 70 39 3 3 IAH
## 5428 428 N492AA 62 44 -3 5 IAH
## 5429 428 N262AA 64 45 -7 -1 IAH
## Dest Distance TaxiIn TaxiOut Cancelled CancellationCode Diverted
## 5424 DFW 224 7 13 0 0
## 5425 DFW 224 6 9 0 0
## 5426 DFW 224 5 17 0 0
## 5427 DFW 224 9 22 0 0
## 5428 DFW 224 9 9 0 0
## 5429 DFW 224 6 13 0 0
Define Variables
#converting all the na values to zero for consistancy
hflights$ArrDelay[ is.na( hflights$ArrDelay ) ] <- 0
hflights$DepDelay[ is.na( hflights$DepDelay ) ] <- 0
X <- hflights$ArrDelay
Y <- hflights$DepDelay
Generate summary level descriptive statistics
summary (X)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -70.000 -8.000 0.000 6.981 11.000 978.000
summary (Y)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -33.000 -3.000 0.000 9.324 9.000 981.000
Xtotal <- length(X) - 3622
Xtotal
## [1] 223874
Ytotal <- length(Y) - 2905
Ytotal
## [1] 224591
Density plot for X and Y
ggplot(hflights, aes(X)) +
geom_density()

ggplot(hflights, aes(Y)) +
geom_density()

‘x’ as 3d quartile of ‘X’ and ‘y’ as 2d quartile ‘Y’
x <- quantile(X, .75)
x
## 75%
## 11
y <- quantile(Y, .5)
y
## 50%
## 0
Table of counts
a <- nrow(subset(hflights, X <= x & Y <= y));
a;
## [1] 111341
b <- nrow(subset(hflights, X <= x & Y > y));
b;
## [1] 61448
c <- nrow(subset(hflights, X > x & Y <= y));
c;
## [1] 6159
d <- nrow(subset(hflights, X > x & Y > y));
d;
## [1] 48548
values <- matrix(c(a,b,(a+b),c,d,(c+d),(a+c),(b+d),(a+b+c+d)),ncol=3, nrow=3,byrow=TRUE)
colnames(values) <- c("<=2d quartile",">2d quartile","Total")
rownames(values) <- c('<=3d quartile', '>3d quartile','Total')
values.table <- as.table(values)
values.table
## <=2d quartile >2d quartile Total
## <=3d quartile 111341 61448 172789
## >3d quartile 6159 48548 54707
## Total 117500 109996 227496
a. P(X>x | Y>y) b. P(X>x, Y>y) c. P(Xy)
# a. P(X>x | Y>y)
48548/109996
## [1] 0.4413615
# b. P(X>x, Y>y)
48548/227496
## [1] 0.2134016
#c. P(X<x | Y>y)
(nrow(subset(hflights, X < x & Y > y)))/(nrow(subset(hflights, Y > y)))
## [1] 0.5382196
Does P(A | B) = P(A) * P(B)?
#P(A | B) = P(X>x | Y>y)
48548/109996
## [1] 0.4413615
#P(A)
A <- nrow(subset(hflights, X > x))
A
## [1] 54707
pofA <- A/nrow(hflights)
pofA
## [1] 0.2404746
#P(B)
B <- nrow(subset(hflights, Y > y))
B
## [1] 109996
#P(A | B) is not equal to P(A) * P(B)
pofB <- B/nrow(hflights)
pofB
## [1] 0.4835074
#P(A) * P(B)
p <- pofA * pofB
p
## [1] 0.1162712
# P(A | B) = .4413 in not equal to P(A) * P(B) = .1163(X and Y variable are not independent)
#Chi Squere Test for association
chisq.test(X,Y)
## Warning in chisq.test(X, Y): Chi-squared approximation may be incorrect
##
## Pearson's Chi-squared test
##
## data: X and Y
## X-squared = 25198000, df = 197740, p-value < 2.2e-16
#We can see form the Chi Square test the p value is smaller than significance level of 0.05, we should reject the hypothesis that X (Arrival delay) and Y Departure dealy are independent
scatterplot of the two variables
qplot(X, Y, main = 'Relationship between Arrival dealay and Departure dalay',
xlab = 'Arrival Delay', ylab = 'Departure Delay')

#Scatter plot indicates strong positive relationship between the two variable
95% confidence interval for the difference in the mean
t.test(X,Y)
##
## Welch Two Sample t-test
##
## data: X and Y
## t = -26.722, df = 453240, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -2.514811 -2.171108
## sample estimates:
## mean of x mean of y
## 6.981384 9.324344
#95% confidence interval for the difference in mean -2.514811 to -2.171108. Values are negative because the mean of random variable X(6.981384) is smaller then mean of random variable Y(9.324344)
correlation matrix for two of the quantitative variables
data <- cbind(X,Y)
matrix <- cor(data, use = "complete")
matrix
## X Y
## X 1.0000000 0.9254458
## Y 0.9254458 1.0000000
Hypothesis test to see if correlation between these variables is 0 and provide a 99% confidence interval
cor.test(X,Y,conf.level=0.99)
##
## Pearson's product-moment correlation
##
## data: X and Y
## t = 1165, df = 227490, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 99 percent confidence interval:
## 0.9246667 0.9262172
## sample estimates:
## cor
## 0.9254458
# 99% confidence interval is 0.924667 to 0.9262172. Correlation between X and Y is not zero, therefore the variables are not independent. Correlation coeffieient of 0.9254458 points to strong dependance.
Invert the Correlation Matrix to Create the ‘Precision Matrix’
pmatrix <- solve(matrix)
pmatrix
## X Y
## X 6.966212 -6.446851
## Y -6.446851 6.966212
Multiply the Correlation Matrix by the Precision Matrix
matrix%*%pmatrix
## X Y
## X 1.000000e+00 0
## Y 8.881784e-16 1
Multiply the Precision Matrix by the Correlation Matrix
pmatrix%*%matrix
## X Y
## X 1 8.881784e-16
## Y 0 1.000000e+00
#The results of both multiplication are inverse of each other
shift X so that the minimum value is above zero
Xshift <- X+ abs(min(X)) + 1
summary (Xshift)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.00 63.00 71.00 77.98 82.00 1049.00
Load the MASS package
require(MASS)
## Loading required package: MASS
library(MASS)
run fitdistr to fit an exponential probability density function
Xexpo <- fitdistr(Xshift, "exponential")
Xexpo
## rate
## 1.282357e-02
## (2.688575e-05)
Optimal value of lambda for the distribution
1 / Xexpo$estimate
## rate
## 77.98138
1000 samples from the exponential distribution
sampex <- rexp(1000, Xexpo$estimate)
summary(sampex)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.1232 22.0300 54.0300 77.4200 107.4000 504.6000
Comparison of histogram of expotential sample with a histogram of original variable
qplot(sampex, geom="histogram", binwidth = 5, fill=I("blue"), col=I("black"))

qplot(X, geom="histogram", binwidth = 5, fill=I("blue"), col=I("black"))

#The expotential sample appears to be less right skwed then the original
5th Percentile
qexp(.05, rate = Xexpo$estimate, lower.tail = TRUE, log.p = FALSE)
## [1] 3.999922
95th Percentile
qexp(.95, rate = Xexpo$estimate, lower.tail = TRUE, log.p = FALSE)
## [1] 233.6113
95% confidence interval from the empirical data, assuming normality
meanX <- mean(na.omit(X)) # get the mean for X
meanX
## [1] 6.981384
sdX <- sqrt(var(na.omit(X)))
sdX
## [1] 30.47602
qnorm(.025, mean = meanX, sd = sdX)
## [1] -52.75051
qnorm(.975, mean = meanX, sd = sdX)
## [1] 66.71328
#If X is normally distributed 95% of the sample mean will be between -52.75051 and 66.71328
The empirical 5th percentile and 95th percentile of the data
#.05 quantile
quantile(X, .05, na.rm = TRUE)
## 5%
## -18
#.95 quantile
quantile(X, .95, na.rm = TRUE)
## 95%
## 56
#GThe empirical 5th percentile and 95th percentil of the implies 90% of the sample X from original data falls between -18 and 56