Data Description : Flights that departed Houston in 2011 Metadata: https://cran.r-project.org/web/packages/hflights/index.html
The following is an investigation of the hflights data package for the CUNY MSDA Bridge Course Data Science Math program.
The following is a step by step representation of a statistical analysis of two variables from the dataset showcasing probability, statistical, and linear algebraic methods.
Setting up the environment. Running a function that removes ‘NA’ values from records. The data selected inclues two variables skewed to the right: Taxi Out and Departure Delay
#Remove warnings to prevent knitting errors.
options(warn=-1)
#Load hflights data
require(hflights)
## Loading required package: hflights
data<- hflights[,c("TaxiOut","DepDelay")]
#Delete records with "NA" values from TaxiOut and DepDelay fields.
completeFun <- function(data, desiredCols) {
completeVec <- complete.cases(data[, desiredCols])
return(data[completeVec, ])
}
data<-completeFun(data, c("TaxiOut","DepDelay"))
The following statistics are useful for future calculations
TaxiOut.mean <-mean(data$TaxiOut)
TaxiOut.median <-median(data$TaxiOut)
TaxiOut.mode <- as.numeric(names(sort(-table(data$TaxiOut))))[1]
TaxiOut.sd <- sd(data$TaxiOut)
DepDelay.mean <-mean(data$DepDelay)
DepDelay.median <-median(data$DepDelay)
DepDelay.mode <- as.numeric(names(sort(-table(data$DepDelay))))[1]
DepDelay.sd <- sd(data$DepDelay)
summary(data)
## TaxiOut DepDelay
## Min. : 1.00 Min. :-33.000
## 1st Qu.: 10.00 1st Qu.: -3.000
## Median : 14.00 Median : 0.000
## Mean : 15.09 Mean : 9.436
## 3rd Qu.: 18.00 3rd Qu.: 9.000
## Max. :163.00 Max. :981.000
For the following probability questions we assume the small letter “x” as the 3rd quartile, and the small letter “y” as the 2nd quartile of variables X (TaxiOut) and Y (DepDelay), respectively. Probability ranges are calculated in a secondary equation.
#Assign quartile values to variables.
XQ3<-quantile(data$TaxiOut, 0.75)
YQ2<-quantile(data$DepDelay, 0.50)
YQ3<-quantile(data$TaxiOut, 0.75)
#Create subsets of data based on quartile operators.
Yy <- subset(data, DepDelay > YQ2)
yY <- subset(data, DepDelay <= YQ2)
Xx <- subset(data, TaxiOut > XQ3)
xX <- subset(data, TaxiOut <= XQ3)
XxYy<- subset(Yy, TaxiOut > XQ3)
XxyY<- subset(yY, TaxiOut > XQ3)
xXYy<- subset(Yy, TaxiOut <= XQ3)
xXyY<- subset(yY, TaxiOut <= XQ3)
#Count the results of the subsets. The calculate their probabilities.
#P(X>x|Y>y)
a<-nrow(XxYy)
nrow(XxYy)/nrow(data)
## [1] 0.1151508
#P(X>x|y<Y)
b<-nrow(XxyY)
nrow(XxyY)/nrow(data)
## [1] 0.1155917
#P(X<x|Y>y)
c<-nrow(xXYy)
nrow(xXYy)/nrow(data)
## [1] 0.3746042
#P(X<x|y<Y)
d<-nrow(xXyY)
nrow(xXyY)/nrow(data)
## [1] 0.3946533
#Count and summarize the count data into a table.
values <- matrix(c(d,c,(d+c),b,a,(b+a),(b+d),(c+a),(a+b+c+d)),ncol=3, nrow=3,byrow=TRUE)
colnames(values) <- c("<=2d quartile",">2d quartile","Total")
rownames(values) <- c('<=3d quartile', '>3d quartile','Total')
values.table <- as.table(values)
values.table
## <=2d quartile >2d quartile Total
## <=3d quartile 88619 84117 172736
## >3d quartile 25956 25857 51813
## Total 114575 109974 224549
The data is independent when tabulated in this way. There is no method used that shows variable X’s influence on variable Y.
QQ Plots
#Main QQ Plot
qqplot(data$TaxiOut,data$DepDelay, xlab="Taxi Out Values", ylab="Departure Delay Values", main="QQ Plot")
#QQ Plots per quartile
qqplot(xXyY$TaxiOut,xXyY$DepDelay, xlab="Taxi Out Values", ylab="Departure Delay Values", main="QQ xXyY")
qqplot(xXYy$TaxiOut,xXYy$DepDelay, xlab="Taxi Out Values", ylab="Departure Delay Values", main="QQ xXYy")
qqplot(XxyY$TaxiOut,XxyY$DepDelay, xlab="Taxi Out Values", ylab="Departure Delay Values", main="QQ XxyY")
qqplot(XxYy$TaxiOut,XxYy$DepDelay, xlab="Taxi Out Values", ylab="Departure Delay Values", main="QQ XxYy")
If we make a new 3rd quartile variable for X called A, and a new 2nd quartile variable for Y called B
#Observations above the 3d quartile for X
Xx_A <- subset(data, TaxiOut >= XQ3)
#Observations for the 2d quartile for Y
Yy_B_1 <- subset(data, DepDelay <= YQ3)
YY_B_2 <- subset(Yy_B_1, DepDelay >= YQ2)
Does P(A|B)=P(A)P(B)? Not Quite, but it is very close.
#P(A|B)
YY_XX <- subset(YY_B_2, TaxiOut >= XQ3)
nrow(YY_XX)/nrow(data)
## [1] 0.1171905
#P(A)P(B)
(nrow(Xx_A)/nrow(data))*nrow(YY_B_2)/nrow(data)
## [1] 0.1124978
Chi-Square test for association. We recieve a p value of 0.00000000000000022. Therefore, we reject the null hypothesis. The values are significant and therefore our data is independent.
require(MASS)
## Loading required package: MASS
chi_table<- table(data$TaxiOut, data$DepDelay)
chisq.test(chi_table)
##
## Pearson's Chi-squared test
##
## data: chi_table
## X-squared = 134320, df = 60634, p-value < 2.2e-16
Variable Density Plots
require(psych)
## Loading required package: psych
describe(data)
## vars n mean sd median trimmed mad min max range skew
## TaxiOut 1 224549 15.09 7.74 14 14.12 5.93 1 163 162 3.26
## DepDelay 2 224549 9.44 28.76 0 3.36 5.93 -33 981 1014 5.96
## kurtosis se
## TaxiOut 26.99 0.02
## DepDelay 71.73 0.06
#Create density plot for TaxiOut variable
d_TaxiOut <- density(data$TaxiOut)
plot(d_TaxiOut, main="Taxi Out Probabilities", ylab="Probability", xlab="Departure Delay (in minutes)")
polygon(d_TaxiOut, col="red")
abline(v = TaxiOut.median, col = "green", lwd = 2)
abline(v = TaxiOut.mean, col = "blue", lwd = 2)
abline(v = TaxiOut.mode, col = "purple", lwd = 2)
legend("topright", legend=c("median", "mean","mode"),col=c("green", "blue", "purple"), lty=1, cex=0.8)
d_TaxiOut <- density(data$TaxiOut)
#Create density plot for DepDelay variable.
#Reduced the data for the plot to cut-off at X < 100 for clarity.
DepDelay_for_graph <- subset(data, DepDelay < 100)
DepDelay_for_graph <- density(DepDelay_for_graph$DepDelay, na.rm=TRUE)
plot(DepDelay_for_graph, main="Departure Delay Probabilities", ylab="Probability", xlab="Departure Delay (in minutes)")
polygon(DepDelay_for_graph, col="red")
abline(v = DepDelay.median, col = "green", lwd = 2)
abline(v = DepDelay.mean, col = "blue", lwd = 2)
abline(v = DepDelay.mode, col = "purple", lwd = 2)
legend("topright", legend=c("median", "mean","mode"),col=c("green", "blue", "purple"), lty=1, cex=0.8, )
#QQ plots
Scatterplot
plot(data$TaxiOut,data$DepDelay, main="TaxiOut vs Delay Scatterplot",
xlab="Taxi Out (Time)", ylab="Departure Delay (Time)", pch=19)
Provide a 95% Confidence Interval for the difference in the mean of the variables. With a Two Sample t-test we recieve 5.531988 5.778389
t.test(data$TaxiOut,data$DepDelay)
##
## Welch Two Sample t-test
##
## data: data$TaxiOut and data$DepDelay
## t = 89.967, df = 256900, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 5.531988 5.778389
## sample estimates:
## mean of x mean of y
## 15.091098 9.435909
Derive a correlation matrix for two of the quantitative variables you selected. We see no linear relationship between the data. Furthermore, a correlation matrix shows low correlation coefficient values between the data.
corMatrix<-cor(data)
corMatrix
## TaxiOut DepDelay
## TaxiOut 1.000000000 0.002026774
## DepDelay 0.002026774 1.000000000
Test the hypothesis that the correlation between these variables is 0 and provide a 99% confidence interval. With another Two Sample t-test we recieve 5.493275 5.817102
t.test(data$TaxiOut,data$DepDelay, conf.level=0.99)
##
## Welch Two Sample t-test
##
## data: data$TaxiOut and data$DepDelay
## t = 89.967, df = 256900, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 99 percent confidence interval:
## 5.493275 5.817102
## sample estimates:
## mean of x mean of y
## 15.091098 9.435909
Invert your correlation matrix (Precision matrix). The diagonal elements represent variance inflation factors, which measures the relationship between combinations between variables.
require(MASS)
corMatrixInverse <- ginv(corMatrix)
corMatrixInverse
## [,1] [,2]
## [1,] 1.000004108 -0.002026782
## [2,] -0.002026782 1.000004108
Multiply the correlation matrix by the precision matrix. This is the identity Matrix
matrix1<- corMatrixInverse %*% corMatrix
matrix1
## TaxiOut DepDelay
## [1,] 1.000000e+00 -1.687019e-16
## [2,] 1.084202e-16 1.000000e+00
Then multiply the precision matrix by the correlation matrix. This represents how matrix products differ depending on the order or direction by which they are multiplied.
matrix2<- corMatrix %*% corMatrixInverse
matrix2
## [,1] [,2]
## TaxiOut 1.000000e+00 -1.691355e-16
## DepDelay 1.088539e-16 1.000000e+00
We take the right-skewed TaxiOut data and fit it to an exponential function
#Using the MASS library
#Calculate the optimal lambda for right-skewed TaxiOut
lambda<-fitdistr(data$TaxiOut,"exponential")
lambda$estimate
## rate
## 0.06626423
#Transpose the rate into 1000 selected variables as an exponential distribution
pdf_distr<-rexp(1000,lambda$estimate)
#Plot the results of the exponential distribution
hist(pdf_distr, freq = FALSE, breaks = 100, main ="Fitted Exponential PDF with TaxiOut data", xlim = c(1, quantile(pdf_distr, 0.99)))
curve(dexp(x, rate = lambda$estimate), col = "red", add = TRUE)
#Plot the results as compared to the original data
hist(data$TaxiOut, freq = FALSE, breaks = 100, main ="Comparison with original TaxiOut data",xlim = c(1, quantile(data$TaxiOut, 0.99)))
curve(dexp(x, rate = lambda$estimate), col = "red", add = TRUE)
With the exponential PDF:
5th and 95th percentiles using the cumulative distribution function (CDF)
qexp(0.05, rate = lambda$estimate, lower.tail = TRUE, log.p = FALSE)
## [1] 0.7740721
qexp(0.95, rate = lambda$estimate, lower.tail = TRUE, log.p = FALSE)
## [1] 45.20889
95% confidence interval from the empirical data, assuming normality
qnorm(0.95,TaxiOut.mean, TaxiOut.sd)
## [1] 27.82288
Empirical 5th percentile and 95th percentile of the data
quantile(data$TaxiOut, c(.05, .95))
## 5% 95%
## 7 28
These exercises help us recognize the differences between a exponential equation and our right-tailed data. We are able to find an approximation and likely a connection between an exponential function, but only after the mean, and therefore this is a noticeably incomplete representation.