#install.packages("hflights")

#loading requird packages
require (hflights)

## Loading required package: hflights

require (ggplot2)

## Loading required package: ggplot2

head (hflights)

##      Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier
## 5424 2011     1          1         6    1400    1500            AA
## 5425 2011     1          2         7    1401    1501            AA
## 5426 2011     1          3         1    1352    1502            AA
## 5427 2011     1          4         2    1403    1513            AA
## 5428 2011     1          5         3    1405    1507            AA
## 5429 2011     1          6         4    1359    1503            AA
##      FlightNum TailNum ActualElapsedTime AirTime ArrDelay DepDelay Origin
## 5424       428  N576AA                60      40      -10        0    IAH
## 5425       428  N557AA                60      45       -9        1    IAH
## 5426       428  N541AA                70      48       -8       -8    IAH
## 5427       428  N403AA                70      39        3        3    IAH
## 5428       428  N492AA                62      44       -3        5    IAH
## 5429       428  N262AA                64      45       -7       -1    IAH
##      Dest Distance TaxiIn TaxiOut Cancelled CancellationCode Diverted
## 5424  DFW      224      7      13         0                         0
## 5425  DFW      224      6       9         0                         0
## 5426  DFW      224      5      17         0                         0
## 5427  DFW      224      9      22         0                         0
## 5428  DFW      224      9       9         0                         0
## 5429  DFW      224      6      13         0                         0

Define Variables

#converting all the na values to zero for consistancy
hflights$ArrDelay[ is.na( hflights$ArrDelay ) ] <- 0
hflights$DepDelay[ is.na( hflights$DepDelay ) ] <- 0

X <- hflights$ArrDelay
Y <- hflights$DepDelay

Generate summary level descriptive statistics

summary (X)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## -70.000  -8.000   0.000   6.981  11.000 978.000

summary (Y)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## -33.000  -3.000   0.000   9.324   9.000 981.000

Xtotal <- length(X) - 3622
Xtotal

## [1] 223874

Ytotal <- length(Y) - 2905
Ytotal

## [1] 224591

Density plot for X and Y

ggplot(hflights, aes(X)) +
  geom_density()

ggplot(hflights, aes(Y)) +
  geom_density()

‘x’ as 3d quartile of ‘X’ and ‘y’ as 2d quartile ‘Y’

x <- quantile(X, .75)
x

## 75% 
##  11

y <- quantile(Y, .5)
y

## 50% 
##   0

Table of counts

a <- nrow(subset(hflights, X <= x & Y <= y));
a;

## [1] 111341

b <- nrow(subset(hflights, X <= x & Y > y));
b;

## [1] 61448

c <- nrow(subset(hflights, X > x & Y <= y));
c;

## [1] 6159

d <- nrow(subset(hflights, X > x & Y > y));
d;

## [1] 48548

values <- matrix(c(a,b,(a+b),c,d,(c+d),(a+c),(b+d),(a+b+c+d)),ncol=3, nrow=3,byrow=TRUE)
colnames(values) <- c("<=2d quartile",">2d quartile","Total")
rownames(values) <- c('<=3d quartile', '>3d quartile','Total')
values.table <- as.table(values)
values.table

##               <=2d quartile >2d quartile  Total
## <=3d quartile        111341        61448 172789
## >3d quartile           6159        48548  54707
## Total                117500       109996 227496

a. P(X>x | Y>y) b. P(X>x, Y>y) c. P(Xy)

# a. P(X>x | Y>y)
48548/109996

## [1] 0.4413615

# b.  P(X>x, Y>y)
48548/227496

## [1] 0.2134016

#c.  P(X<x | Y>y)
(nrow(subset(hflights, X < x & Y > y)))/(nrow(subset(hflights, Y > y)))

## [1] 0.5382196

Does P(A | B) = P(A) * P(B)?

#P(A | B) = P(X>x | Y>y)
48548/109996

## [1] 0.4413615

#P(A)
A <- nrow(subset(hflights, X > x))
A

## [1] 54707

pofA <- A/nrow(hflights)
pofA

## [1] 0.2404746

#P(B)
B <- nrow(subset(hflights, Y > y))
B

## [1] 109996

#P(A | B) is not equal to P(A) * P(B)
pofB <- B/nrow(hflights)
pofB

## [1] 0.4835074

#P(A) * P(B)
p <- pofA * pofB
p

## [1] 0.1162712

# P(A | B) = .4413 in not equal to P(A) * P(B) = .1163(X and Y variable are not independent)

#Chi Squere Test for association

chisq.test(X,Y)

## Warning in chisq.test(X, Y): Chi-squared approximation may be incorrect

## 
##  Pearson's Chi-squared test
## 
## data:  X and Y
## X-squared = 25198000, df = 197740, p-value < 2.2e-16

#We can see form the Chi Square test the p value is smaller than significance level of 0.05, we should reject the hypothesis that X (Arrival delay) and Y Departure dealy are independent

scatterplot of the two variables

qplot(X, Y, main = 'Relationship between Arrival dealay and Departure dalay',
      xlab = 'Arrival Delay', ylab = 'Departure Delay')

#Scatter plot indicates strong positive relationship between the two variable

95% confidence interval for the difference in the mean

t.test(X,Y)

## 
##  Welch Two Sample t-test
## 
## data:  X and Y
## t = -26.722, df = 453240, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -2.514811 -2.171108
## sample estimates:
## mean of x mean of y 
##  6.981384  9.324344

#95% confidence interval for the difference in mean -2.514811 to -2.171108. Values are negative because the mean of random variable X(6.981384) is smaller then mean of random variable Y(9.324344)

correlation matrix for two of the quantitative variables

data <- cbind(X,Y)
matrix <- cor(data, use = "complete")
matrix

##           X         Y
## X 1.0000000 0.9254458
## Y 0.9254458 1.0000000

Hypothesis test to see if correlation between these variables is 0 and provide a 99% confidence interval

cor.test(X,Y,conf.level=0.99)

## 
##  Pearson's product-moment correlation
## 
## data:  X and Y
## t = 1165, df = 227490, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 99 percent confidence interval:
##  0.9246667 0.9262172
## sample estimates:
##       cor 
## 0.9254458

# 99% confidence interval is 0.924667 to 0.9262172. Correlation between X and Y is not zero, therefore the variables are not independent. Correlation coeffieient of 0.9254458 points to strong dependance.

Invert the Correlation Matrix to Create the ‘Precision Matrix’

pmatrix <- solve(matrix)
pmatrix

##           X         Y
## X  6.966212 -6.446851
## Y -6.446851  6.966212

Multiply the Correlation Matrix by the Precision Matrix

matrix%*%pmatrix

##              X Y
## X 1.000000e+00 0
## Y 8.881784e-16 1

Multiply the Precision Matrix by the Correlation Matrix

pmatrix%*%matrix

##   X            Y
## X 1 8.881784e-16
## Y 0 1.000000e+00

#The results of both multiplication are inverse of each other

shift X so that the minimum value is above zero

Xshift  <- X+ abs(min(X)) + 1
summary (Xshift)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1.00   63.00   71.00   77.98   82.00 1049.00

Load the MASS package

require(MASS)

## Loading required package: MASS

library(MASS)

run fitdistr to fit an exponential probability density function

Xexpo <- fitdistr(Xshift, "exponential")
Xexpo

##        rate    
##   1.282357e-02 
##  (2.688575e-05)

Optimal value of lambda for the distribution

1 / Xexpo$estimate

##     rate 
## 77.98138

1000 samples from the exponential distribution

sampex <- rexp(1000, Xexpo$estimate)
summary(sampex)

##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##   0.1232  22.0300  54.0300  77.4200 107.4000 504.6000

Comparison of histogram of expotential sample with a histogram of original variable

qplot(sampex, geom="histogram", binwidth = 5, fill=I("blue"), col=I("black"))

qplot(X, geom="histogram", binwidth = 5, fill=I("blue"), col=I("black"))

#The expotential sample appears to be less right skwed then the original

5th Percentile

qexp(.05, rate = Xexpo$estimate, lower.tail = TRUE, log.p = FALSE)

## [1] 3.999922

95th Percentile

qexp(.95, rate = Xexpo$estimate, lower.tail = TRUE, log.p = FALSE)

## [1] 233.6113

95% confidence interval from the empirical data, assuming normality

meanX <- mean(na.omit(X)) # get the mean for X
meanX

## [1] 6.981384

sdX <- sqrt(var(na.omit(X)))
sdX

## [1] 30.47602

qnorm(.025, mean = meanX, sd = sdX)

## [1] -52.75051

qnorm(.975, mean = meanX, sd = sdX)

## [1] 66.71328

#If X is normally distributed 95% of the sample mean will be between -52.75051 and 66.71328

The empirical 5th percentile and 95th percentile of the data

#.05 quantile 
quantile(X, .05, na.rm = TRUE)

##  5% 
## -18

#.95 quantile
quantile(X, .95, na.rm = TRUE)

## 95% 
##  56

#GThe empirical 5th percentile and 95th percentil of the implies 90% of the sample X from original data falls between -18 and 56

Bridge Math Final

Ahsanul Choudhury

August 10, 2016

Define Variables

Generate summary level descriptive statistics

Density plot for X and Y

‘x’ as 3d quartile of ‘X’ and ‘y’ as 2d quartile ‘Y’

Table of counts

a. P(X>x | Y>y) b. P(X>x, Y>y) c. P(Xy)

Does P(A | B) = P(A) * P(B)?

scatterplot of the two variables

95% confidence interval for the difference in the mean

correlation matrix for two of the quantitative variables

Hypothesis test to see if correlation between these variables is 0 and provide a 99% confidence interval

Invert the Correlation Matrix to Create the ‘Precision Matrix’

Multiply the Correlation Matrix by the Precision Matrix

Multiply the Precision Matrix by the Correlation Matrix

shift X so that the minimum value is above zero

Load the MASS package

run fitdistr to fit an exponential probability density function

Optimal value of lambda for the distribution

1000 samples from the exponential distribution

Comparison of histogram of expotential sample with a histogram of original variable

5th Percentile

95th Percentile

95% confidence interval from the empirical data, assuming normality

The empirical 5th percentile and 95th percentile of the data