DS Math Final

I am using the hflights dataset.
Loading dataset and getting basic summary statistics

library(hflights)
library(ggplot2)
library(MASS)
library(corrplot)
head(hflights)

##      Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier
## 5424 2011     1          1         6    1400    1500            AA
## 5425 2011     1          2         7    1401    1501            AA
## 5426 2011     1          3         1    1352    1502            AA
## 5427 2011     1          4         2    1403    1513            AA
## 5428 2011     1          5         3    1405    1507            AA
## 5429 2011     1          6         4    1359    1503            AA
##      FlightNum TailNum ActualElapsedTime AirTime ArrDelay DepDelay Origin
## 5424       428  N576AA                60      40      -10        0    IAH
## 5425       428  N557AA                60      45       -9        1    IAH
## 5426       428  N541AA                70      48       -8       -8    IAH
## 5427       428  N403AA                70      39        3        3    IAH
## 5428       428  N492AA                62      44       -3        5    IAH
## 5429       428  N262AA                64      45       -7       -1    IAH
##      Dest Distance TaxiIn TaxiOut Cancelled CancellationCode Diverted
## 5424  DFW      224      7      13         0                         0
## 5425  DFW      224      6       9         0                         0
## 5426  DFW      224      5      17         0                         0
## 5427  DFW      224      9      22         0                         0
## 5428  DFW      224      9       9         0                         0
## 5429  DFW      224      6      13         0                         0

flights <-hflights [, c('ArrDelay', 'DepDelay')]
flights <- flights[complete.cases(flights), ]


head (flights)

##      ArrDelay DepDelay
## 5424      -10        0
## 5425       -9        1
## 5426       -8       -8
## 5427        3        3
## 5428       -3        5
## 5429       -7       -1

names(flights)[1]<-"y"
names(flights)[2]<-"x"

head(flights)

##        y  x
## 5424 -10  0
## 5425  -9  1
## 5426  -8 -8
## 5427   3  3
## 5428  -3  5
## 5429  -7 -1

summary(flights$y)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## -70.000  -8.000   0.000   7.094  11.000 978.000

summary(flights$x)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## -33.000  -3.000   0.000   9.415   9.000 981.000

Plotting histograms to see what we have

ggplot(data=flights, aes(flights$x)) + geom_histogram(binwidth = 1)  + xlim ( -50, 100) +ggtitle( "flights$x departures")

ggplot(data=flights, aes(flights$y)) + geom_histogram(binwidth = 1)  + xlim ( -50, 100) + ggtitle ("flights$y arrivals")

Getting Quantiles

quantile(flights$y)

##   0%  25%  50%  75% 100% 
##  -70   -8    0   11  978

quantile(flights$x)

##   0%  25%  50%  75% 100% 
##  -33   -3    0    9  981

y <- quantile(flights$y)[3]
x <- quantile(flights$x)[4]

P(X>x | Y>y)

a <-  length(flights$x[which(flights$x > x)] )/length(flights$x)
b <-  length(flights$y[which(flights$y > y)] )/length(flights$y)

(a[[1]]*b[[1]])/a[[1]]

## [1] 0.4775901

P(X>x, Y>y)

a <-  length(flights$x[which(flights$x > x)] )/length(flights$x)
b <-  length(flights$y[which(flights$y > y)] )/length(flights$y)
a[[1]]*b[[1]]

## [1] 0.1177133

P(X < x | Y >y )

a <-  length(flights$x[which(flights$x < x)] )/length(flights$x)
b <-  length(flights$y[which(flights$y > y)] )/length(flights$y)
(a[[1]]*b[[1]])/a[[1]]

## [1] 0.4775901

Yes P(AB) = P(A|B), there for the 2 variables, departure delay and arrival delay are dependant. No Splitting them does not make them independant

y <- quantile(flights$y)[3]
x <- quantile(flights$x)[4]

cell1a <- length(flights$x[which(flights$x <= x)] )/ length(  flights$y[which(flights$y <= y)] )  

cell1b <- length(flights$x[which(flights$x <= x)])/ length(flights$y[which(flights$y > y)]) 

cell2a <-  length(flights$x[which(flights$x > x)]) / length(flights$y[which(flights$y <= y)]) 

cell2b <-   length(flights$x[which(flights$x > x)]) /  length(flights$y[which(flights$y > y)]) 

cell1c <- cell1a + cell1b
cell2c <- cell2a+cell2b

t1 <- cell1a+cell2a
t2 <- cell2b+cell1b

t3 <- t1+t2

x/y	<= 2dquartile	> 2d Quartile	total
<=3d Quartile	1.4424047	1.5777684	3.0201731
>3d quartile	0.4718009	0.5160774	0.9878783
Total	1.9142056	2.0938459	4.0080515

This chart is showing us that “X” is a lot more focused around the mead than “y”.

Preforming chi squared

minX <- abs( min(flights$x))
minY <- abs(min(flights$y))
flights$x <- flights$x + minX +1
flights$y <- flights$y+minY + 1

chisq.test(flights)

## 
##  Pearson's Chi-squared test
## 
## data:  flights
## X-squared = 360110, df = 223870, p-value < 2.2e-16

flights$x <- flights$x - minX -1
flights$y <- flights$y - minY - 1

The p value is extremely small therefore there is a negligible chance that the departure delay and arrival delay are independent

Descriptive and Inferential Statistics. Provide univariate descriptive statistics and appropriate plots. Provide a scatterplot of the two variables. Provide a 95% CI for the difference in the mean of the variables. pa Test the hypothesis that the correlation between these variables is 0 and provide a 99% confidence interval. Discuss the meaning of your analysis.

ggplot(flights, aes(x=flights$x, y=flights$y)) +
    geom_point(shape=1)  +  geom_smooth(method=lm)

95% interval

meanDif <- mean(flights$y) - mean(flights$x)
ci <- qnorm(1-(.05/2) )+ sqrt( (sd(flights$y)/length(flights$y)) - (sd(flights$x)/length(flights$x)  ) )
meanDif+ci

## [1] -0.3577213

meanDif-ci

## [1] -4.283577

Get Correlation Matrix

M <- cor( flights)
corrplot(M, method="circle")

They are correlated becuase correlation is close to 1.

Check to see if correlation is not 0

cor.test (flights$x, flights$y, conf.level  = .99)

## 
##  Pearson's product-moment correlation
## 
## data:  flights$x and flights$y
## t = 1189.8, df = 223870, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 99 percent confidence interval:
##  0.9284710 0.9299578
## sample estimates:
##       cor 
## 0.9292181

Since p value is very small there is very little likelyhood that this correlation is based on chance

Multiply the correlation matrix by the precision matrix, and then multiply the precision matrix by the correlation matrix.

##           y         x
## y 1.0000000 0.9292181
## x 0.9292181 1.0000000

Minv = solve(M)
Minv

##           y         x
## y  7.323130 -6.804785
## x -6.804785  7.323130

M%*%Minv

##   y x
## y 1 0
## x 0 1

Since the diagonal of the covariance matrix is positive there is a positive relationship between x and y. Therefore, the more a departure is delayed the more its associated arrival is delayed.

For your variable that is skewed to the right, shift it so that the minimum value is above zero. Then load the MASS package and run fitdistr to fit an exponential probability density function. (See https://stat.ethz.ch/R-manual/R-devel/library/MASS/html/fitdistr.html ). Find the optimal value of l for this distribution, and then take 1000 samples from this exponential distribution using this value (e.g., rexp(1000, l)). Plot a histogram and compare it with a histogram of your original variable. Using the exponential pdf, find the 5th and 95th percentiles using the cumulative distribution function (CDF). Also generate a 95% confidence interval from the empirical data, assuming normality. Finally, provide the empirical 5th percentile and 95th percentile of the data. Discuss.

flights$x  <- flights$x + abs(min(flights$x)) + 1
lambda <- fitdistr(x, "exponential")

lambda <-lambda[[1]]
lambda

##      rate 
## 0.1111111

exValues <- data.frame(x = rexp(10000, 2.303352e-02 ))  - abs(min(flights$x)) - 1
flights$x <-flights$x - abs(min(flights$x)) - 1

ggplot(data=flights, aes(flights$x)) + geom_histogram(binwidth = 1)   + xlim(-30, 100) + ggtitle("observed data")

## Warning: Removed 8487 rows containing non-finite values (stat_bin).

ggplot(data=exValues, aes(exValues$x)) + geom_histogram(binwidth = 1)  + xlim ( -30, 100 )+ ggtitle("exponential  regression")

## Warning: Removed 960 rows containing non-finite values (stat_bin).

The shape the gamma distribution looks much more like the shape of the original flights data than the exponential. I am going to use the gamma distribution for the confidence intervals below.

Getting 95% confidence interval from gamma exp

qexp(.05,rate = lambda)

## [1] 0.4616396

qexp(.05, rate =  lambda, lower.tail = FALSE )

## [1] 26.96159

Getting 95% Confindece interval from normal distribution

ci <- qt(1-(.95/2),df=length(flights$x)-1)*sd(flights$x)/sqrt(length(flights$x))
mean(flights$x)+ci

## [1] 41.41879

mean(flights$x) - ci

## [1] 41.41117

Getting 95th percentile from actual data

quantile(flights$x, .05 )

## 5% 
## 25

quantile(flights$x, .95 )

## 95% 
##  88

The 95% confidence interval from the from the normal distribution is not that accurate, because the left side of a normal dist does not go nearly as negative as the right side goes positive therefore the standard deviation smaller than it should be.

The confidence intervals for the exp regression is closer than from the normal assumption, but it does not account for all of the long poistive tail or any negative values.

DS Math Final

Raphael Nash

August 7, 2016