Data Description : Flights that departed Houston in 2011 Metadata: https://cran.r-project.org/web/packages/hflights/index.html

The following is an investigation of the hflights data package for the CUNY MSDA Bridge Course Data Science Math program.

The following is a step by step representation of a statistical analysis of two variables from the dataset showcasing probability, statistical, and linear algebraic methods.

Initialization

Setting up the environment. Running a function that removes ‘NA’ values from records. The data selected inclues two variables skewed to the right: Taxi Out and Departure Delay

#Remove warnings to prevent knitting errors.
options(warn=-1)

#Load hflights data
require(hflights)
## Loading required package: hflights
data<- hflights[,c("TaxiOut","DepDelay")]

#Delete records with "NA" values from TaxiOut and DepDelay fields.
completeFun <- function(data, desiredCols) {
  completeVec <- complete.cases(data[, desiredCols])
  return(data[completeVec, ])
}
data<-completeFun(data, c("TaxiOut","DepDelay"))

Basic Statistics

The following statistics are useful for future calculations

TaxiOut.mean <-mean(data$TaxiOut)
TaxiOut.median <-median(data$TaxiOut)
TaxiOut.mode <- as.numeric(names(sort(-table(data$TaxiOut))))[1]
TaxiOut.sd <- sd(data$TaxiOut)
DepDelay.mean <-mean(data$DepDelay)
DepDelay.median <-median(data$DepDelay)
DepDelay.mode <- as.numeric(names(sort(-table(data$DepDelay))))[1]
DepDelay.sd <- sd(data$DepDelay)
summary(data)
##     TaxiOut          DepDelay      
##  Min.   :  1.00   Min.   :-33.000  
##  1st Qu.: 10.00   1st Qu.: -3.000  
##  Median : 14.00   Median :  0.000  
##  Mean   : 15.09   Mean   :  9.436  
##  3rd Qu.: 18.00   3rd Qu.:  9.000  
##  Max.   :163.00   Max.   :981.000

Probability and Counts

For the following probability questions we assume the small letter “x” as the 3rd quartile, and the small letter “y” as the 2nd quartile of variables X (TaxiOut) and Y (DepDelay), respectively. Probability ranges are calculated in a secondary equation.

#Assign quartile values to variables.
XQ3<-quantile(data$TaxiOut, 0.75)
YQ2<-quantile(data$DepDelay, 0.50)
YQ3<-quantile(data$TaxiOut, 0.75)

#Create subsets of data based on quartile operators.
Yy <- subset(data, DepDelay > YQ2)
yY <- subset(data, DepDelay <= YQ2)
Xx <- subset(data, TaxiOut > XQ3)
xX <- subset(data, TaxiOut <= XQ3)

XxYy<- subset(Yy, TaxiOut > XQ3)
XxyY<- subset(yY, TaxiOut > XQ3)
xXYy<- subset(Yy, TaxiOut <= XQ3)
xXyY<- subset(yY, TaxiOut <= XQ3)

#Count the results of the subsets. The calculate their probabilities.

#P(X>x|Y>y)
a<-nrow(XxYy)
nrow(XxYy)/nrow(data)
## [1] 0.1151508
#P(X>x|y<Y)
b<-nrow(XxyY)
nrow(XxyY)/nrow(data)
## [1] 0.1155917
#P(X<x|Y>y)
c<-nrow(xXYy)
nrow(xXYy)/nrow(data)
## [1] 0.3746042
#P(X<x|y<Y)
d<-nrow(xXyY)
nrow(xXyY)/nrow(data)
## [1] 0.3946533
#Count and summarize the count data into a table. 
values <- matrix(c(d,c,(d+c),b,a,(b+a),(b+d),(c+a),(a+b+c+d)),ncol=3, nrow=3,byrow=TRUE)
colnames(values) <- c("<=2d quartile",">2d quartile","Total")
rownames(values) <- c('<=3d quartile', '>3d quartile','Total')
values.table <- as.table(values)
values.table
##               <=2d quartile >2d quartile  Total
## <=3d quartile         88619        84117 172736
## >3d quartile          25956        25857  51813
## Total                114575       109974 224549

The data is independent when tabulated in this way. There is no method used that shows variable X’s influence on variable Y.

QQ Plots

#Main QQ Plot
qqplot(data$TaxiOut,data$DepDelay, xlab="Taxi Out Values", ylab="Departure Delay Values", main="QQ Plot")

#QQ Plots per quartile
qqplot(xXyY$TaxiOut,xXyY$DepDelay, xlab="Taxi Out Values", ylab="Departure Delay Values", main="QQ xXyY")

qqplot(xXYy$TaxiOut,xXYy$DepDelay, xlab="Taxi Out Values", ylab="Departure Delay Values", main="QQ xXYy")

qqplot(XxyY$TaxiOut,XxyY$DepDelay, xlab="Taxi Out Values", ylab="Departure Delay Values", main="QQ XxyY")

qqplot(XxYy$TaxiOut,XxYy$DepDelay, xlab="Taxi Out Values", ylab="Departure Delay Values", main="QQ XxYy")

If we make a new 3rd quartile variable for X called A, and a new 2nd quartile variable for Y called B

#Observations above the 3d quartile for X
Xx_A <- subset(data, TaxiOut >= XQ3)

#Observations for the 2d quartile for Y
Yy_B_1 <- subset(data, DepDelay <= YQ3)
YY_B_2 <- subset(Yy_B_1, DepDelay >= YQ2)

Does P(A|B)=P(A)P(B)? Not Quite, but it is very close.

#P(A|B)
YY_XX <- subset(YY_B_2, TaxiOut >= XQ3)
nrow(YY_XX)/nrow(data)
## [1] 0.1171905
#P(A)P(B)
(nrow(Xx_A)/nrow(data))*nrow(YY_B_2)/nrow(data)
## [1] 0.1124978

Chi-Square test for association. We recieve a p value of 0.00000000000000022. Therefore, we reject the null hypothesis. The values are significant and therefore our data is independent.

require(MASS)
## Loading required package: MASS
chi_table<- table(data$TaxiOut, data$DepDelay)
chisq.test(chi_table)
## 
##  Pearson's Chi-squared test
## 
## data:  chi_table
## X-squared = 134320, df = 60634, p-value < 2.2e-16

Descriptive and Inferential Statistics

Variable Density Plots

require(psych)
## Loading required package: psych
describe(data)
##          vars      n  mean    sd median trimmed  mad min max range skew
## TaxiOut     1 224549 15.09  7.74     14   14.12 5.93   1 163   162 3.26
## DepDelay    2 224549  9.44 28.76      0    3.36 5.93 -33 981  1014 5.96
##          kurtosis   se
## TaxiOut     26.99 0.02
## DepDelay    71.73 0.06
#Create density plot for TaxiOut variable
d_TaxiOut <- density(data$TaxiOut)
plot(d_TaxiOut, main="Taxi Out Probabilities", ylab="Probability", xlab="Departure Delay (in minutes)")
polygon(d_TaxiOut, col="red")
abline(v = TaxiOut.median, col = "green", lwd = 2)
abline(v = TaxiOut.mean, col = "blue", lwd = 2)
abline(v = TaxiOut.mode, col = "purple", lwd = 2)
legend("topright", legend=c("median", "mean","mode"),col=c("green", "blue", "purple"), lty=1, cex=0.8)

d_TaxiOut <- density(data$TaxiOut)

#Create density plot for DepDelay variable.
#Reduced the data for the plot to cut-off at X < 100 for clarity.
DepDelay_for_graph <- subset(data, DepDelay < 100)
DepDelay_for_graph <- density(DepDelay_for_graph$DepDelay, na.rm=TRUE)
plot(DepDelay_for_graph, main="Departure Delay Probabilities", ylab="Probability", xlab="Departure Delay (in minutes)")
polygon(DepDelay_for_graph, col="red")
abline(v = DepDelay.median, col = "green", lwd = 2)
abline(v = DepDelay.mean, col = "blue", lwd = 2)
abline(v = DepDelay.mode, col = "purple", lwd = 2)
legend("topright", legend=c("median", "mean","mode"),col=c("green", "blue", "purple"), lty=1, cex=0.8, )

#QQ plots

Scatterplot

plot(data$TaxiOut,data$DepDelay, main="TaxiOut vs Delay Scatterplot", 
     xlab="Taxi Out (Time)", ylab="Departure Delay (Time)", pch=19)

Provide a 95% Confidence Interval for the difference in the mean of the variables. With a Two Sample t-test we recieve 5.531988 5.778389

t.test(data$TaxiOut,data$DepDelay)
## 
##  Welch Two Sample t-test
## 
## data:  data$TaxiOut and data$DepDelay
## t = 89.967, df = 256900, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  5.531988 5.778389
## sample estimates:
## mean of x mean of y 
## 15.091098  9.435909

Derive a correlation matrix for two of the quantitative variables you selected. We see no linear relationship between the data. Furthermore, a correlation matrix shows low correlation coefficient values between the data.

corMatrix<-cor(data)
corMatrix
##              TaxiOut    DepDelay
## TaxiOut  1.000000000 0.002026774
## DepDelay 0.002026774 1.000000000

Test the hypothesis that the correlation between these variables is 0 and provide a 99% confidence interval. With another Two Sample t-test we recieve 5.493275 5.817102

t.test(data$TaxiOut,data$DepDelay, conf.level=0.99)
## 
##  Welch Two Sample t-test
## 
## data:  data$TaxiOut and data$DepDelay
## t = 89.967, df = 256900, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 99 percent confidence interval:
##  5.493275 5.817102
## sample estimates:
## mean of x mean of y 
## 15.091098  9.435909

Linear Algebra and Correlation

Invert your correlation matrix (Precision matrix). The diagonal elements represent variance inflation factors, which measures the relationship between combinations between variables.

require(MASS)
corMatrixInverse <- ginv(corMatrix)
corMatrixInverse
##              [,1]         [,2]
## [1,]  1.000004108 -0.002026782
## [2,] -0.002026782  1.000004108

Multiply the correlation matrix by the precision matrix. This is the identity Matrix

matrix1<- corMatrixInverse %*% corMatrix
matrix1
##           TaxiOut      DepDelay
## [1,] 1.000000e+00 -1.687019e-16
## [2,] 1.084202e-16  1.000000e+00

Then multiply the precision matrix by the correlation matrix. This represents how matrix products differ depending on the order or direction by which they are multiplied.

matrix2<- corMatrix %*% corMatrixInverse
matrix2
##                  [,1]          [,2]
## TaxiOut  1.000000e+00 -1.691355e-16
## DepDelay 1.088539e-16  1.000000e+00

Calculus-Based Probability & Statistics

We take the right-skewed TaxiOut data and fit it to an exponential function

#Using the MASS library
#Calculate the optimal lambda for right-skewed TaxiOut
lambda<-fitdistr(data$TaxiOut,"exponential")
lambda$estimate
##       rate 
## 0.06626423
#Transpose the rate into 1000 selected variables as an exponential distribution
pdf_distr<-rexp(1000,lambda$estimate)

#Plot the results of the exponential distribution
hist(pdf_distr, freq = FALSE, breaks = 100, main ="Fitted Exponential PDF with TaxiOut data", xlim = c(1, quantile(pdf_distr, 0.99)))
curve(dexp(x, rate = lambda$estimate), col = "red", add = TRUE)

#Plot the results as compared to the original data
hist(data$TaxiOut, freq = FALSE, breaks = 100, main ="Comparison with original TaxiOut data",xlim = c(1, quantile(data$TaxiOut, 0.99)))
curve(dexp(x, rate = lambda$estimate), col = "red", add = TRUE)

With the exponential PDF:

5th and 95th percentiles using the cumulative distribution function (CDF)

qexp(0.05, rate = lambda$estimate, lower.tail = TRUE, log.p = FALSE)
## [1] 0.7740721
qexp(0.95, rate = lambda$estimate, lower.tail = TRUE, log.p = FALSE)
## [1] 45.20889

95% confidence interval from the empirical data, assuming normality

qnorm(0.95,TaxiOut.mean, TaxiOut.sd)
## [1] 27.82288

Empirical 5th percentile and 95th percentile of the data

quantile(data$TaxiOut, c(.05, .95))
##  5% 95% 
##   7  28

These exercises help us recognize the differences between a exponential equation and our right-tailed data. We are able to find an approximation and likely a connection between an exponential function, but only after the mean, and therefore this is a noticeably incomplete representation.