This project will show off your ability to understand probability, statistics, linear algebra, and R.
Using the hflights package or another large (greater than 100,000 observations and 20 variables) dataset of your choosing, pick two quantitative variables (e.g., Arrival Delay and Departure Delay). Define one of these variables as the random variable X and the other as the random variable Y. One of these variables must be skewed to the right (positively)
library("ggplot2")
library("hflights")
library("MASS")
flightDF <- subset(hflights,select=c(ArrDelay, DepDelay))
flightDF <- na.omit(flightDF)
x <- quantile(flightDF$DepDelay, .75)
y <- quantile(flightDF$ArrDelay, .50)
x_probability_less_or_equal <- sum(flightDF$DepDelay <= x) / length(flightDF$DepDelay)
x_probability_greater <- sum(flightDF$DepDelay > x) / length(flightDF$DepDelay)
y_probability_less_or_equal <- sum(flightDF$ArrDelay <= y) / length(flightDF$ArrDelay)
y_probability_greater <- sum(flightDF$ArrDelay > y) / length(flightDF$ArrDelay)
a_probability <- (x_probability_greater * y_probability_greater) / y_probability_greater
a_probability
## [1] 0.2464735
Using formula P(X | Y) = P(X and Y) / P(Y), the probability of > x given > y is 0.2464735.
b_probability <- x_probability_greater *
((x_probability_greater * y_probability_greater) / x_probability_greater)
b_probability
## [1] 0.1177133
Using formula P(X and Y) = P(X) * P(Y | X), the probability of both x and y is 0.1177133.
c_probability <- (x_probability_less_or_equal * y_probability_less_or_equal) /
y_probability_less_or_equal
c_probability
## [1] 0.7535265
Using formula P(X | Y) = P(X and Y) / P(Y), the probability of < x given < y is 0.7535265.
table_of_probabilities <- matrix(c(
x_probability_less_or_equal * y_probability_less_or_equal,
x_probability_greater * y_probability_less_or_equal,
x_probability_less_or_equal * y_probability_less_or_equal +
x_probability_greater * y_probability_less_or_equal,
x_probability_less_or_equal * y_probability_greater,
x_probability_greater * y_probability_greater,
x_probability_less_or_equal * y_probability_greater +
x_probability_greater * y_probability_greater,
x_probability_less_or_equal * y_probability_less_or_equal +
x_probability_less_or_equal * y_probability_greater,
x_probability_greater * y_probability_less_or_equal +
x_probability_greater * y_probability_greater,
x_probability_less_or_equal * y_probability_less_or_equal +
x_probability_less_or_equal * y_probability_greater +
x_probability_greater * y_probability_less_or_equal +
x_probability_greater * y_probability_greater
),nrow = 3)
colnames(table_of_probabilities) <- c("<=2d quantile",">2d quantile","Col Total")
rownames(table_of_probabilities) <- c("<=3d quantile",">3d quantile","Row Total")
table_of_probabilities <- as.table(table_of_probabilities)
table_of_probabilities
## <=2d quantile >2d quantile Col Total
## <=3d quantile 0.3936497 0.3598768 0.7535265
## >3d quantile 0.1287602 0.1177133 0.2464735
## Row Total 0.5224099 0.4775901 1.0000000
Yes it seems that splitting them makes X and Y independent events.
A_probability_greater <- sum(flightDF$DepDelay > x) / length(flightDF$DepDelay)
A_probability_greater
## [1] 0.2464735
B_probability_less_or_equal <- sum(flightDF$ArrDelay <= y) / length(flightDF$ArrDelay)
B_probability_less_or_equal
## [1] 0.5224099
A_given_B_probability <- nrow(subset(flightDF, DepDelay > x & ArrDelay <= y)) / nrow(flightDF)
A_given_B_probability
## [1] 0.0170319
Are_Probabilities_Equal <- (A_given_B_probability == (A_probability_greater * B_probability_less_or_equal))
Are_Probabilities_Equal
## [1] FALSE
Mathematically, they are not equal since P(A|B) is 0.0170319, while P(A)(B) is 0.1287602.
flightDF_cor <- subset(hflights, ArrDelay > 0 & DepDelay > 0, select=c(ArrDelay, DepDelay))
flightDF_cor <- na.omit(flightDF_cor)
chisq.test(flightDF_cor)
## Warning in chisq.test(flightDF_cor): Chi-squared approximation may be
## incorrect
##
## Pearson's Chi-squared test
##
## data: flightDF_cor
## X-squared = 247640, df = 78807, p-value < 2.2e-16
Running the chi-squared test, the p-value is less than 0.05, suggesting they are not independent.
## ArrDelay DepDelay
## Min. :-70.000 Min. :-33.000
## 1st Qu.: -8.000 1st Qu.: -3.000
## Median : 0.000 Median : 0.000
## Mean : 7.094 Mean : 9.415
## 3rd Qu.: 11.000 3rd Qu.: 9.000
## Max. :978.000 Max. :981.000
It seems to suggest a strong positive relationship between Departure delays and Arrival delays
t.test(flightDF$DepDelay,flightDF$ArrDelay, conf.level = 0.95)
##
## Welch Two Sample t-test
##
## data: flightDF$DepDelay and flightDF$ArrDelay
## t = 26.106, df = 445800, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 2.146418 2.494880
## sample estimates:
## mean of x mean of y
## 9.414983 7.094334
The 95% Ci for the difference in the mean of the variables is between 2.146418 and 2.494880
cor(flightDF)
## ArrDelay DepDelay
## ArrDelay 1.0000000 0.9292181
## DepDelay 0.9292181 1.0000000
cor.test(flightDF$DepDelay,flightDF$ArrDelay, conf.level = 0.99)
##
## Pearson's product-moment correlation
##
## data: flightDF$DepDelay and flightDF$ArrDelay
## t = 1189.8, df = 223870, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 99 percent confidence interval:
## 0.9284710 0.9299578
## sample estimates:
## cor
## 0.9292181
Rejecting the null hypothesis since a strong correlation exists with the 99% Confidence interval between 0.9284710 and 0.9299578
cor_matrix <- cor(flightDF)
solve(cor_matrix)
## ArrDelay DepDelay
## ArrDelay 7.323130 -6.804785
## DepDelay -6.804785 7.323130
cor_matrix %*% solve(cor_matrix)
## ArrDelay DepDelay
## ArrDelay 1 0
## DepDelay 0 1
cor_matrix %*% solve(cor_matrix)
## ArrDelay DepDelay
## ArrDelay 1 0
## DepDelay 0 1