R Markdown

library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5     v purrr   0.3.4
## v tibble  3.1.2     v dplyr   1.0.6
## v tidyr   1.1.3     v stringr 1.4.0
## v readr   1.4.0     v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(ggplot2)
library(truncnorm)
## Warning: package 'truncnorm' was built under R version 4.1.2
library(extraDistr)
## Warning: package 'extraDistr' was built under R version 4.1.2
## 
## Attaching package: 'extraDistr'
## The following object is masked from 'package:purrr':
## 
##     rdunif
library(Rcpp)
## Warning: package 'Rcpp' was built under R version 4.1.2
# Setup
setwd("C:/Users/sweeneys/OneDrive - Michigan Medicine/Desktop/STATS 406/Project")
SFO <- read.csv("SFO Dataset.csv")

SFOnew    <- SFO[which(SFO$RATESFO > 0 & SFO$RATESFO < 6), ]     # valid 1-5 overall SFO ratings (used in the boxplots and chi-square tests below)
SFOclean  <- SFO[which(SFO$PASSTHRU > 0 & SFO$PASSTHRU < 6), ]   # valid 1-5 security pass-through ratings
SFOgroup  <- SFOclean[which(SFOclean$GROUP > 0), ]               # records assigned to one of the three comparison groups
SFOgroup1 <- SFOgroup[which(SFOgroup$GROUP == 1), ]              # group 1: pre-2014

SFOpre  <- SFO[which(SFO$POST == 0), ]   # pre-period responses
SFOpost <- SFO[which(SFO$POST == 1), ]   # post-period responses (redefined later in the descriptive-statistics section)

SFOGp1 <- SFOclean[which(SFOclean$GROUP == 1),]
SFOGp2 <- SFOclean[which(SFOclean$GROUP == 2),]
SFOGp3 <- SFOclean[which(SFOclean$GROUP == 3),]

#hist(SFOGp1$PASSTHRU)
#hist(SFOGp2$PASSTHRU)
#hist(SFOGp3$PASSTHRU)

#counts<-table(SFOclean$PASSTHRU)
#barplot(counts)

counts2<-table(SFOgroup$GROUP, SFOgroup$PASSTHRU)
barplot(counts2, legend = rownames(counts2), beside=TRUE)

#ggplot() + geom_histogram(data=SFOclean, aes(x=PASSTHRU, y=..density..), binwidth = 1) + stat_function(fun = dnorm, args = list(mean = 5, sd = 0.9),col = "#1b98e0",size = 2)

#ggplot() + geom_histogram(data=SFOclean, aes(x=PASSTHRU, y=..density..), binwidth = 1) + stat_function(fun = dnorm, args = list(mean = 5, sd = 0.8),col = "#1b98e0",size = 2)


rate <- 1:5
datpois <- dtpois(rate, lambda = 5, a = 1, b = 5)   # truncated-Poisson (lambda = 5) density values at ratings 1-5
plot(rate, datpois, type = 'b')

datpoisl4 <- dtpois(rate, lambda = 4, a = 1, b = 5)   # same with lambda = 4 (kept for comparison; not used below)
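# Optional check (not part of the original analysis): compare the empirical
# distribution of PASSTHRU with the truncated-Poisson values in datpois.
# The emp_pmf object is introduced here purely for illustration.
emp_pmf <- prop.table(table(factor(SFOclean$PASSTHRU, levels = 1:5)))
round(cbind(empirical = as.numeric(emp_pmf), tpois_lambda5 = datpois), 3)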


tpois_pts <- data.frame(x = 1:5, y = datpois)   # truncated-Poisson reference points

ggplot(data = SFOclean) +
  geom_histogram(aes(x = PASSTHRU, y = ..density..), binwidth = 1, col = "black", fill = "grey") +
  geom_point(data = tpois_pts, aes(x = x, y = y), color = "red")

ggplot(data = SFOgroup1) +
  geom_histogram(aes(x = PASSTHRU, y = ..density..), binwidth = 1, col = "black", fill = "grey") +
  geom_point(data = tpois_pts, aes(x = x, y = y), color = "red")
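# Hypothetical extension (not in the original): the same overlay, faceted by GROUP,
# to compare all three groups against the lambda = 5 truncated-Poisson points at once
# (reuses the tpois_pts data frame defined above).
ggplot(SFOgroup, aes(x = PASSTHRU)) +
  geom_histogram(aes(y = ..density..), binwidth = 1, col = "black", fill = "grey") +
  geom_point(data = tpois_pts, aes(x = x, y = y), color = "red") +
  facet_wrap(~GROUP)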

#hist(SFO$RATESFO)

boxplot(SFOnew$RATESFO ~ SFOnew$TERM)

boxplot(SFOnew$RATESFO ~ SFOnew$GENDER)

boxplot(SFOnew$RATESFO ~ SFOnew$AGE)

SFOtable<- table(SFOnew$YEAR, SFOnew$RATESFO)
barplot(SFOtable, main="Overall Satisfaction with SFO by year", legend.text=rownames(SFOtable))

#sat.year<-aov (RATESFO ~ YEAR, data=SFOnew)

#summary(sat.year)

#length(SFOnew$POST)
#length(SFOnew$RATESFO)

chisq.test(SFOnew$POST,SFOnew$RATESFO)

chisq.test(SFOpre$YEAR,SFOpre$RATESFO)
chisq.test(SFOpost$YEAR,SFOpost$RATESFO)
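# Optional diagnostic (not in the original): standardized residuals show which
# period/rating cells drive a significant chi-square statistic.
round(chisq.test(SFOnew$POST, SFOnew$RATESFO)$stdres, 2)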

SIMULATION DISTRIBUTION - RV CREATION

B <- 1000                     # number of Monte Carlo replicates (each simulated sample also has B observations)
n <- length(SFOgroup$GROUP)   # observed sample size (not used in the draws below)
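# Reproducibility sketch (not part of the original script): fixing the RNG seed before
# the replicate() calls below would make the simulation repeatable; the outputs shown
# in this document come from an unseeded run.
set.seed(406)   # the specific seed value is arbitrary / illustrative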
# Simulated rating samples from a truncated Poisson on 1-5:
# two null groups (lambda = 5) and one alternative group (lambda = 4)
p1 <- data.frame(replicate(B, rtpois(B, 5, a = 1, b = 5)))
p2 <- data.frame(replicate(B, rtpois(B, 5, a = 1, b = 5)))
p3 <- data.frame(replicate(B, rtpois(B, 4, a = 1, b = 5)))

#pt1<-data.frame(replicate(B,rt(B,df=1,ncp=5)))
#pt2<-data.frame(replicate(B,rt(B,df=1,ncp=5)))
#pt3<-data.frame(replicate(B,rt(B,df=1,ncp=3)))

# Simulated rating samples from a normal distribution truncated to [1, 5]:
# two null groups (mean 4.7) and one alternative group (mean 5)
ptn1 <- data.frame(replicate(B, rtruncnorm(B, a = 1, b = 5, mean = 4.7, sd = 1.6)))
ptn2 <- data.frame(replicate(B, rtruncnorm(B, a = 1, b = 5, mean = 4.7, sd = 1.6)))
ptn3 <- data.frame(replicate(B, rtruncnorm(B, a = 1, b = 5, mean = 5, sd = 1.3)))

null_samples <- p1   # null: truncated Poisson, lambda = 5
alt_samples  <- p3   # alternative: truncated Poisson, lambda = 4

stat_1 <- function(x){   # test statistic: the sample mean
  mean(x)
}

ts_0 <- map_dbl(null_samples, stat_1)   # null distribution of the statistic
rr <- quantile(ts_0, 0.95)              # upper-tail rejection threshold
mean(ts_0 > rr)                         # estimated type I error rate
## [1] 0.05
ts_1 <- map_dbl(alt_samples, stat_1)
mean(ts_1 > rr)                         # estimated power against lambda = 4 (upper-tail test)
## [1] 0
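# Optional sketch (not in the original): 95% binomial confidence intervals for the
# estimated size and power above, in the same spirit as the binom.test() intervals
# reported in the Monte Carlo section below.
binom.test(sum(ts_0 > rr), B, conf.level = 0.95)$conf.int   # uncertainty in the size estimate
binom.test(sum(ts_1 > rr), B, conf.level = 0.95)$conf.int   # uncertainty in the power estimate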
lambda_power <- seq(1, 5, length.out = 20)

# Power curve for the mean statistic; note these alternatives are drawn from an
# untruncated Poisson with rate lambda, unlike the truncated draws used for the null
power_mom <- map_dbl(lambda_power, function(lambda){
    altmom <- data.frame(replicate(B, rpois(B, lambda)))
    ts_mom <- map_dbl(altmom, stat_1)
    mean(ts_mom > rr)
})

pp1<-data.frame(lambda_power,power_mom)
ggplot(data=pp1,aes(x=lambda_power,y=power_mom)) + geom_line() + geom_point()
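# Hypothetical addition (not in the original): approximate +/- 2 standard-error bands
# for the simulated power curve, using the binomial Monte Carlo SE sqrt(p * (1 - p) / B).
pp1$se <- sqrt(pp1$power_mom * (1 - pp1$power_mom) / B)
ggplot(data = pp1, aes(x = lambda_power, y = power_mom)) +
  geom_ribbon(aes(ymin = pmax(power_mom - 2 * se, 0),
                  ymax = pmin(power_mom + 2 * se, 1)), alpha = 0.2) +
  geom_line() +
  geom_point()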

null_samples_tn <- ptn1   # null: truncated normal, mean 4.7
alt_samples_tn  <- ptn3   # alternative: truncated normal, mean 5
stat_tn <- function(x){   # test statistic: the sample mean
  mean(x)
}

ts_0_tn <- map_dbl(null_samples_tn, stat_tn)   # null distribution of the statistic
rr_tn <- quantile(ts_0_tn, 0.95)               # upper-tail rejection threshold
mean(ts_0_tn > rr_tn)                          # estimated type I error rate
## [1] 0.05
ts_1_tn <- map_dbl(alt_samples_tn, stat_tn)
mean(ts_1_tn > rr_tn)                          # estimated power against mean = 5
## [1] 1
tn_power <- seq(4, 5, length.out = 20)

# Power curve for the mean statistic over a grid of truncated-normal alternative means
power_tn_mean <- map_dbl(tn_power, function(t){
    alt <- data.frame(replicate(B, rtruncnorm(B, a = 1, b = 5, mean = t, sd = 1.6)))
    ts_tn <- map_dbl(alt, stat_tn)
    mean(ts_tn > rr_tn)
})

pptn<-data.frame(tn_power,power_tn_mean)
ggplot(data=pptn,aes(x=tn_power,y=power_tn_mean)) + geom_line() + geom_point()
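# Hypothetical follow-up (not in the original): the smallest alternative mean in the
# grid whose estimated power reaches at least 0.80.
min(tn_power[power_tn_mean >= 0.8])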

MONTE CARLO SIMULATIONS

# Mean of each simulated truncated-Poisson sample
p1means <- map_dbl(p1, mean)
p2means <- map_dbl(p2, mean)
p3means <- map_dbl(p3, mean)


d1 <- p1means - p2means   # difference in means under the null (both lambda = 5)
mean(d1)
## [1] -0.002102
quantile(d1,c(0.025,0.975))
##      2.5%     97.5% 
## -0.092025  0.089000
#rr1<-quantile(d1,0.95)
#mean(d1>rr1) # difference of mean - type 1 (one-sided)

d2 <- p1means - p3means   # difference in means under the alternative (lambda = 5 vs 4)
mean(d2)
## [1] 0.247398
quantile(d2,c(0.025,0.975))
##     2.5%    97.5% 
## 0.152975 0.338025
#mean(d2>rr1) # difference of mean - power (one-sided)


rr2 <- quantile(d1, 0.975)   # two-sided rejection thresholds based on the null differences
rr3 <- quantile(d1, 0.025)

mean(d1 > rr2) + mean(d1 < rr3)   # difference of means - type I error
## [1] 0.049
binom.test((sum(d1>rr2)+sum(d1<rr3)),B,conf.level = 0.95)$conf.int
## [1] 0.03646706 0.06426605
## attr(,"conf.level")
## [1] 0.95
mean(d2>rr2) + mean(d2<rr3) # difference of mean - power
## [1] 0.999
binom.test((sum(d2>rr2)+sum(d2<rr3)),B,conf.level = 0.95)$conf.int
## [1] 0.9944411 0.9999747
## attr(,"conf.level")
## [1] 0.95
# One Monte Carlo replicate of the two-sided Wilcoxon-Mann-Whitney test for two
# truncated-Poisson samples; returns 1 if the test rejects at the 0.05 level
pt.wmw <- function(n, m, lambda1, lambda2){
  fdata <- rtpois(n, lambda1, a = 1, b = 5)
  gdata <- rtpois(m, lambda2, a = 1, b = 5)
  pval.wmw <- wilcox.test(fdata, gdata, alternative = "two.sided")$p.value
  if(pval.wmw < 0.05) 1 else 0
}

pt1.wmw<-replicate(B,pt.wmw(1000,1000,5,5))
est.pti.wmw<-sum(pt1.wmw)/B
est.pti.wmw # wilcox - type 1
## [1] 0.047
binom.test(sum(pt1.wmw),B,conf.level = 0.95)$conf.int
## [1] 0.03473506 0.06201263
## attr(,"conf.level")
## [1] 0.95
pt2.wmw<-replicate(B,pt.wmw(1000,1000,5,4))
est.pt2.wmw<-sum(pt2.wmw)/B
est.pt2.wmw  # wilcox - power
## [1] 0.999
binom.test(sum(pt2.wmw),B,conf.level = 0.95)$conf.int
## [1] 0.9944411 0.9999747
## attr(,"conf.level")
## [1] 0.95
#TRUNCATED NORMAL

ptn1means<-map_dbl(ptn1,mean)
ptn2means<-map_dbl(ptn2,mean)
ptn3means<-map_dbl(ptn3,mean)

dtn1<-ptn1means-ptn2means
mean(dtn1)
## [1] -0.0004584533
quantile(dtn1,c(0.025,0.975))
##        2.5%       97.5% 
## -0.08351339  0.07877126
#rrtn1<-quantile(dtn1,0.95)
#rrtn1
#mean(dtn1>rrtn1) # difference of mean - type 1 (one-sided)

dtn2<-ptn1means-ptn3means
mean(dtn2)
## [1] -0.3035379
quantile(dtn2,c(0.025,0.975))
##       2.5%      97.5% 
## -0.3776802 -0.2298890
#mean(dtn2>rrtn1) # difference of mean - power (one-sided)


rrtn2<-quantile(dtn1,0.975)
rrtn3<-quantile(dtn1,0.025)

mean(dtn1>rrtn2) + mean(dtn1<rrtn3)# difference of mean - type 1
## [1] 0.05
binom.test((sum(dtn1>rrtn2)+sum(dtn1<rrtn3)),B,conf.level = 0.95)$conf.int
## [1] 0.03733540 0.06539049
## attr(,"conf.level")
## [1] 0.95
mean(dtn2>rrtn2) + mean(dtn2<rrtn3) # difference of mean - power
## [1] 1
binom.test((sum(dtn2>rrtn2)+sum(dtn2<rrtn3)),B,conf.level = 0.95)$conf.int
## [1] 0.9963179 1.0000000
## attr(,"conf.level")
## [1] 0.95
#rtruncnorm(B, a=1, b=5, mean=4.7, sd=1.6)

# One Monte Carlo replicate of the two-sided Wilcoxon-Mann-Whitney test for two
# truncated-normal samples; returns 1 if the test rejects at the 0.05 level
tn.wmw <- function(n, m, mu1, mu2, sd1, sd2){
  fdata <- rtruncnorm(n, a = 1, b = 5, mean = mu1, sd = sd1)
  gdata <- rtruncnorm(m, a = 1, b = 5, mean = mu2, sd = sd2)
  pval.wmw <- wilcox.test(fdata, gdata, alternative = "two.sided")$p.value
  if(pval.wmw < 0.05) 1 else 0
}

ptn1.wmw<-replicate(B,tn.wmw(1000,1000,4.7,4.7,1.6,1.6))
est.ptni.wmw<-sum(ptn1.wmw)/B
est.ptni.wmw # wilcox - type 1
## [1] 0.049
binom.test(sum(ptn1.wmw),B,conf.level = 0.95)$conf.int
## [1] 0.03646706 0.06426605
## attr(,"conf.level")
## [1] 0.95
ptn2.wmw<-replicate(B,tn.wmw(1000,1000,4.7,5,1.6,1.3))
est.ptn2.wmw<-sum(ptn2.wmw)/B
est.ptn2.wmw  # wilcox - power
## [1] 1
binom.test(sum(ptn2.wmw),B,conf.level = 0.95)$conf.int
## [1] 0.9963179 1.0000000
## attr(,"conf.level")
## [1] 0.95

DESCRIPTIVE STATISTICS

library(pastecs)
## Warning: package 'pastecs' was built under R version 4.1.2
## 
## Attaching package: 'pastecs'
## The following objects are masked from 'package:dplyr':
## 
##     first, last
## The following object is masked from 'package:tidyr':
## 
##     extract
# TSAPRE
SFOgroup$TSAPRE <- as.factor(SFOgroup$TSAPRE)
summary(SFOgroup$TSAPRE)
##     1     2  NA's 
##  4513  6271 12222
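# Optional summary (not in the original): the share of each TSAPRE response among
# respondents who answered the item (table() drops the NAs).
prop.table(table(SFOgroup$TSAPRE))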
tapply(SFOgroup$TSAPRE, SFOgroup$YEAR, function(x) format(summary(x)))
## $`2010`
##      1      2   NA's 
## "   0" "   0" "2808" 
## 
## $`2011`
##      1      2   NA's 
## "   0" "   0" "3354" 
## 
## $`2012`
##      1      2   NA's 
## "   0" "   0" "3004" 
## 
## $`2013`
##      1      2   NA's 
## "   0" "   0" "3056" 
## 
## $`2014`
##      1      2 
## " 779" "1333" 
## 
## $`2015`
##      1      2 
## " 973" "1241" 
## 
## $`2016`
##      1      2 
## " 919" "1373" 
## 
## $`2017`
##      1      2 
## " 906" "1183" 
## 
## $`2018`
##      1      2 
## " 936" "1141"
SFOpost <- SFOgroup[which(SFOgroup$YEAR > 2013), ]   # note: overwrites the earlier SFOpost (POST == 1) subset

counts3<-table(SFOpost$YEAR, SFOpost$TSAPRE)
barplot(counts3, beside=TRUE, main="TSA Pre-Check Usage by Year, 2014-2018")

# PASSTHRU
summary(SFOgroup$PASSTHRU)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.000   4.000   4.000   4.131   5.000   5.000
tapply(SFOgroup$PASSTHRU, SFOgroup$GROUP, function(x) format(summary(x))) #https://stackoverflow.com/questions/9847054/how-to-get-summary-statistics-by-group
## $`1`
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## "1.000" "3.000" "4.000" "4.013" "5.000" "5.000" 
## 
## $`2`
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## "1.000" "4.000" "4.000" "4.153" "5.000" "5.000" 
## 
## $`3`
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## "1.000" "4.000" "5.000" "4.422" "5.000" "5.000"
stat.desc(SFOgroup$PASSTHRU)
##      nbr.val     nbr.null       nbr.na          min          max        range 
## 2.300600e+04 0.000000e+00 0.000000e+00 1.000000e+00 5.000000e+00 4.000000e+00 
##          sum       median         mean      SE.mean CI.mean.0.95          var 
## 9.504700e+04 4.000000e+00 4.131401e+00 6.820962e-03 1.336954e-02 1.070366e+00 
##      std.dev     coef.var 
## 1.034585e+00 2.504199e-01
tapply(SFOgroup$PASSTHRU, SFOgroup$GROUP, function(x) format(stat.desc(x)))
## $`1`
##        nbr.val       nbr.null         nbr.na            min            max 
## "1.222200e+04" "0.000000e+00" "0.000000e+00" "1.000000e+00" "5.000000e+00" 
##          range            sum         median           mean        SE.mean 
## "4.000000e+00" "4.904800e+04" "4.000000e+00" "4.013091e+00" "9.858877e-03" 
##   CI.mean.0.95            var        std.dev       coef.var 
## "1.932496e-02" "1.187947e+00" "1.089930e+00" "2.715936e-01" 
## 
## $`2`
##        nbr.val       nbr.null         nbr.na            min            max 
## "6.271000e+03" "0.000000e+00" "0.000000e+00" "1.000000e+00" "5.000000e+00" 
##          range            sum         median           mean        SE.mean 
## "4.000000e+00" "2.604100e+04" "4.000000e+00" "4.152607e+00" "1.266507e-02" 
##   CI.mean.0.95            var        std.dev       coef.var 
## "2.482788e-02" "1.005894e+00" "1.002943e+00" "2.415212e-01" 
## 
## $`3`
##        nbr.val       nbr.null         nbr.na            min            max 
## "4.513000e+03" "0.000000e+00" "0.000000e+00" "1.000000e+00" "5.000000e+00" 
##          range            sum         median           mean        SE.mean 
## "4.000000e+00" "1.995800e+04" "5.000000e+00" "4.422335e+00" "1.261997e-02" 
##   CI.mean.0.95            var        std.dev       coef.var 
## "2.474132e-02" "7.187563e-01" "8.477950e-01" "1.917075e-01"
counts2<-table(SFOgroup$GROUP, SFOgroup$PASSTHRU)
barplot(counts2, legend = c("Pre-2014","Post-2014,No Precheck","Post-2014,Precheck"),args.legend = list(x ='topleft', bty='n'), beside=TRUE,main="Ease of Passing through Security Ratings")

#RATE SFO
summary(SFOgroup$RATESFO)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   4.000   4.000   3.878   4.000   6.000
tapply(SFOgroup$RATESFO, SFOgroup$GROUP, function(x) format(summary(x))) 
## $`1`
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## "0.000" "4.000" "4.000" "3.866" "4.000" "6.000" 
## 
## $`2`
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## "0.000" "4.000" "4.000" "3.899" "4.000" "6.000" 
## 
## $`3`
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## "0.000" "4.000" "4.000" "3.883" "4.000" "6.000"
#ALL
dim(SFOgroup)
## [1] 23006    60
SFOgroup$YEAR<-as.factor(SFOgroup$YEAR)
SFOgroup$GROUP<-as.factor(SFOgroup$GROUP)
table(SFOgroup$YEAR, SFOgroup$GROUP)
##       
##           1    2    3
##   2010 2808    0    0
##   2011 3354    0    0
##   2012 3004    0    0
##   2013 3056    0    0
##   2014    0 1333  779
##   2015    0 1241  973
##   2016    0 1373  919
##   2017    0 1183  906
##   2018    0 1141  936
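# Optional view (not in the original): the same cross-tabulation as within-year
# proportions, showing the post-2014 split between groups 2 and 3.
round(prop.table(table(SFOgroup$YEAR, SFOgroup$GROUP), margin = 1), 3)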

WILCOXON PAIRWISE COMPARISONS

SFOgroup1<- SFOgroup[which(SFOgroup$GROUP == 1),]
SFOgroup2<- SFOgroup[which(SFOgroup$GROUP == 2),]
SFOgroup3<- SFOgroup[which(SFOgroup$GROUP == 3),]
SFOgroup23<- SFOgroup[which(SFOgroup$GROUP == 2|SFOgroup$GROUP == 3),]
SFOgroup12<- SFOgroup[which(SFOgroup$GROUP == 1|SFOgroup$GROUP == 2),]

a <- SFOgroup1$PASSTHRU    # pre-2014
b <- SFOgroup2$PASSTHRU    # post-2014, no Pre-check
c <- SFOgroup3$PASSTHRU    # post-2014, Pre-check
d <- SFOgroup23$PASSTHRU   # post-2014 (groups 2 and 3 combined)
e <- SFOgroup12$PASSTHRU   # no Pre-check (groups 1 and 2 combined)
  

# Pre-2014 vs Post-2014 No PreCheck
wmw1<-wilcox.test(a,b, alternative="two.sided")
wmw1
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  a and b
## W = 33256657, p-value = 2.811e-14
## alternative hypothesis: true location shift is not equal to 0
pval.wmw1<-wilcox.test(a,b, alternative="two.sided")$p.value
pval.wmw1
## [1] 2.811189e-14
# Post-2014 No PreCheck vs. Post-2014 PreCheck
wmw2<-wilcox.test(b,c, alternative="two.sided")
wmw2
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  b and c
## W = 11077248, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
pval.wmw2<-wilcox.test(b,c, alternative="two.sided")$p.value
pval.wmw2
## [1] 7.836384e-47
# No PreCheck vs PreCheck
wmw3<-wilcox.test(e,c, alternative="two.sided")
wmw3
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  e and c
## W = 30972227, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
pval.wmw3<-wilcox.test(e,c, alternative="two.sided")$p.value
pval.wmw3
## [1] 6.251409e-98
# Pre-2014 vs Post-2014
wmw4<-wilcox.test(a,d, alternative="two.sided")
wmw4
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  a and d
## W = 53151636, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
pval.wmw4<-wilcox.test(a,d, alternative="two.sided")$p.value
pval.wmw4
## [1] 1.945769e-68
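# Optional sketch (not part of the original analysis): Holm-adjusted p-values for the
# four pairwise Wilcoxon comparisons above, as a guard against multiple testing.
p.adjust(c(pval.wmw1, pval.wmw2, pval.wmw3, pval.wmw4), method = "holm")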