R Markdown

Set-Up/Data Cleaning

library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5     v purrr   0.3.4
## v tibble  3.1.2     v dplyr   1.0.6
## v tidyr   1.1.3     v stringr 1.4.0
## v readr   1.4.0     v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(ggplot2)
library(truncnorm)
## Warning: package 'truncnorm' was built under R version 4.1.2
library(extraDistr)
## Warning: package 'extraDistr' was built under R version 4.1.2
## 
## Attaching package: 'extraDistr'
## The following object is masked from 'package:purrr':
## 
##     rdunif
library(Rcpp)
## Warning: package 'Rcpp' was built under R version 4.1.2
set.seed(406)

#setup
setwd("C:/Users/sweeneys/OneDrive - Michigan Medicine/Desktop/STATS 406/Project")
SFO <- read.csv("SFO Dataset.csv")
#SFOnew<- SFO[which(SFO$RATESFO >0 & SFO$RATESFO <6),]
SFOclean<- SFO[which(SFO$PASSTHRU >0 & SFO$PASSTHRU <6),]
SFOgroup<- SFOclean[which(SFOclean$GROUP > 0),]
SFOgroup1<- SFOgroup[which(SFOgroup$GROUP == 1),]

SFOpre<- SFO[which(SFO$POST == 0),]
SFOpost<- SFO[which(SFO$POST == 1),]

SFOGp1 <- SFOclean[which(SFOclean$GROUP == 1),]
SFOGp2 <- SFOclean[which(SFOclean$GROUP == 2),]
SFOGp3 <- SFOclean[which(SFOclean$GROUP == 3),]

Exploratory Data Analysis/Plots

library(ggplot2)

#hist(SFOGp1$PASSTHRU)
#hist(SFOGp2$PASSTHRU)
#hist(SFOGp3$PASSTHRU)

#counts<-table(SFOclean$PASSTHRU)
#barplot(counts)

counts2<-table(SFOgroup$GROUP, SFOgroup$PASSTHRU)
barplot(counts2, legend = rownames(counts2), beside=TRUE)

#ggplot() + geom_histogram(data=SFOclean, aes(x=PASSTHRU, y=..density..), binwidth = 1) + stat_function(fun = dnorm, args = list(mean = 5, sd = 0.9),col = "#1b98e0",size = 2)

#ggplot() + geom_histogram(data=SFOclean, aes(x=PASSTHRU, y=..density..), binwidth = 1) + stat_function(fun = dnorm, args = list(mean = 5, sd = 0.8),col = "#1b98e0",size = 2)


rate<-1:5
datpois<-dtpois(rate,lambda = 5, a=1, b=5)
plot(rate,datpois, type='b')

datpoisl4<-dtpois(rate,lambda = 4, a=1, b=5)


ggplot(data = SFOclean) + geom_histogram(aes(x = PASSTHRU, y = ..density..), binwidth = 1, bins = 5, col = "black", fill = "grey") + geom_point(aes(x=1, y=datpois[1]), color="red") + geom_point(aes(x=2, y=datpois[2]), color="red") + geom_point(aes(x=3, y=datpois[3]), color="red") + geom_point(aes(x=4, y=datpois[4]), color="red") + geom_point(aes(x=5, y=datpois[5]), color="red")

ggplot(data = SFOgroup1) + geom_histogram(aes(x = PASSTHRU, y = ..density..), binwidth = 1, bins = 5, col = "black", fill = "grey") + geom_point(aes(x=1, y=datpois[1]), color="red") + geom_point(aes(x=2, y=datpois[2]), color="red") + geom_point(aes(x=3, y=datpois[3]), color="red") + geom_point(aes(x=4, y=datpois[4]), color="red") + geom_point(aes(x=5, y=datpois[5]), color="red")

#hist(SFO$RATESFO)

boxplot(SFOgroup$RATESFO ~ SFOgroup$TERM)

boxplot(SFOgroup$RATESFO ~ SFOgroup$GENDER)

boxplot(SFOgroup$RATESFO ~ SFOgroup$AGE)

SFOtable<- table(SFOgroup$YEAR, SFOgroup$RATESFO)
barplot(SFOtable, main="Overall Satisfaction with SFO by year", legend.text=rownames(SFOtable))

#sat.year<-aov (RATESFO ~ YEAR, data=SFOnew)

#summary(sat.year)

#length(SFOnew$POST)
#length(SFOnew$RATESFO)

chisq.test(SFOnew$POST,SFOnew$RATESFO)

chisq.test(SFOpre$YEAR,SFOpre$RATESFO)
chisq.test(SFOpost$YEAR,SFOpost$RATESFO)

Simulation Distributions - RV Creation

B=1000
n=length(SFOgroup$GROUP)
p1<-data.frame(replicate(B,rtpois(B,5, a=1, b=5)))
p2<-data.frame(replicate(B,rtpois(B,5, a=1, b=5)))
p3<-data.frame(replicate(B,rtpois(B,4, a=1, b=5)))

#pt1<-data.frame(replicate(B,rt(B,df=1,ncp=5)))
#pt2<-data.frame(replicate(B,rt(B,df=1,ncp=5)))
#pt3<-data.frame(replicate(B,rt(B,df=1,ncp=3)))

ptn1<-data.frame(replicate(B,rtruncnorm(B, a=1, b=5, mean=4.7, sd=1.6)))
ptn2<-data.frame(replicate(B,rtruncnorm(B, a=1, b=5, mean=4.7, sd=1.6)))
ptn3<-data.frame(replicate(B,rtruncnorm(B, a=1, b=5, mean=5, sd=1.3)))

Truncated Poisson Simulation for Power Function

null_samples<-p1
alt_samples<-p3

stat_1<-function(x){
  mean(x)
}

ts_0 <-map_dbl(null_samples, stat_1)
rr<-quantile(ts_0,0.95)
mean(ts_0>rr)
## [1] 0.048
ts_1 <-map_dbl(alt_samples,stat_1)
mean(ts_1>rr)
## [1] 0
lambda_power<-seq(1,5,length.out=20)

power_mom <- map_dbl(lambda_power,function(lambda){
    altmom<-data.frame(replicate(B, { rpois(B,lambda)}))
    ts_mom<-map_dbl(altmom,stat_1)
    mean(ts_mom>rr)
})

pp1<-data.frame(lambda_power,power_mom)
ggplot(data=pp1,aes(x=lambda_power,y=power_mom)) + geom_line() + geom_point()

Truncated Normal Simulation for Power Function

null_samples_tn<-ptn1
alt_samples_tn<-ptn3
stat_tn<-function(x){
  mean(x)
}

ts_0_tn <-map_dbl(null_samples_tn, stat_tn)
rr_tn<-quantile(ts_0_tn,0.95)
mean(ts_0_tn>rr_tn)
## [1] 0.05
ts_1_tn <-map_dbl(alt_samples_tn,stat_tn)
mean(ts_1_tn>rr_tn)
## [1] 1
tn_power<-seq(4,5,length.out=20)

power_tn_mean <- map_dbl(tn_power,function(t){
    alt<-data.frame(replicate(1000, {rtruncnorm(B, a=1, b=5, mean=t, sd=1.6)}))
    ts_tn<-map_dbl(alt,stat_tn)
    mean(ts_tn>rr_tn)
})

pptn<-data.frame(tn_power,power_tn_mean)
ggplot(data=pptn,aes(x=tn_power,y=power_tn_mean)) + geom_line() + geom_point()

Monte Carlo Simulations - Truncated Poisson and Truncated Normal

p1means<-map_dbl(p1,mean)
p2means<-map_dbl(p2,mean)
p3means<-map_dbl(p3,mean)


d1<-p1means-p2means
mean(d1)
## [1] -0.001156
quantile(d1,c(0.025,0.975))
##   2.5%  97.5% 
## -0.090  0.085
#rr1<-quantile(d1,0.95)
#mean(d1>rr1) # difference of mean - type 1 (one-sided)

d2<-p1means-p3means
mean(d2)
## [1] 0.244472
quantile(d2,c(0.025,0.975))
##  2.5% 97.5% 
## 0.152 0.338
#mean(d2>rr1) # difference of mean - power (one-sided)


rr2<-quantile(d1,0.975)
rr3<-quantile(d1,0.025)

mean(d1>rr2) + mean(d1<rr3)# difference of mean - type 1
## [1] 0.048
binom.test((sum(d1>rr2)+sum(d1<rr3)),B,conf.level = 0.95)$conf.int
## [1] 0.03560027 0.06314012
## attr(,"conf.level")
## [1] 0.95
mean(d2>rr2) + mean(d2<rr3) # difference of mean - power
## [1] 0.999
binom.test((sum(d2>rr2)+sum(d2<rr3)),B,conf.level = 0.95)$conf.int
## [1] 0.9944411 0.9999747
## attr(,"conf.level")
## [1] 0.95
pt.wmw<-function(n,m,lambda1, lambda2){
fdata<-rtpois(n,lambda1, a=1, b=5)
gdata<-rtpois(m,lambda2, a=1, b=5)
pval.wmw<-wilcox.test(fdata,gdata, alternative="two.sided")$p.value
if(pval.wmw<0.05) 1 else 0
}

pt1.wmw<-replicate(B,pt.wmw(1000,1000,5,5))
est.pti.wmw<-sum(pt1.wmw)/B
est.pti.wmw # wilcox - type 1
## [1] 0.054
binom.test(sum(pt1.wmw),B,conf.level = 0.95)$conf.int
## [1] 0.04082335 0.06987401
## attr(,"conf.level")
## [1] 0.95
pt2.wmw<-replicate(B,pt.wmw(1000,1000,5,4))
est.pt2.wmw<-sum(pt2.wmw)/B
est.pt2.wmw  # wilcox - power
## [1] 1
binom.test(sum(pt2.wmw),B,conf.level = 0.95)$conf.int
## [1] 0.9963179 1.0000000
## attr(,"conf.level")
## [1] 0.95
#TRUNCATED NORMAL

ptn1means<-map_dbl(ptn1,mean)
ptn2means<-map_dbl(ptn2,mean)
ptn3means<-map_dbl(ptn3,mean)

dtn1<-ptn1means-ptn2means
mean(dtn1)
## [1] 0.001409347
quantile(dtn1,c(0.025,0.975))
##        2.5%       97.5% 
## -0.07821950  0.08865552
#rrtn1<-quantile(dtn1,0.95)
#rrtn1
#mean(dtn1>rrtn1) # difference of mean - type 1 (one-sided)

dtn2<-ptn1means-ptn3means
mean(dtn2)
## [1] -0.3025945
quantile(dtn2,c(0.025,0.975))
##       2.5%      97.5% 
## -0.3785654 -0.2283354
#mean(dtn2>rrtn1) # difference of mean - power (one-sided)


rrtn2<-quantile(dtn1,0.975)
rrtn3<-quantile(dtn1,0.025)

mean(dtn1>rrtn2) + mean(dtn1<rrtn3)# difference of mean - type 1
## [1] 0.05
binom.test((sum(dtn1>rrtn2)+sum(dtn1<rrtn3)),B,conf.level = 0.95)$conf.int
## [1] 0.03733540 0.06539049
## attr(,"conf.level")
## [1] 0.95
mean(dtn2>rrtn2) + mean(dtn2<rrtn3) # difference of mean - power
## [1] 1
binom.test((sum(dtn2>rrtn2)+sum(dtn2<rrtn3)),B,conf.level = 0.95)$conf.int
## [1] 0.9963179 1.0000000
## attr(,"conf.level")
## [1] 0.95
#rtruncnorm(B, a=1, b=5, mean=4.7, sd=1.6)

tn.wmw<-function(n,m,mu1,mu2,sd1,sd2){
fdata<-rtruncnorm(n,a=1, b=5,mean = mu1, sd=sd1)
gdata<-rtruncnorm(m,a=1, b=5,mean = mu2, sd=sd2)
pval.wmw<-wilcox.test(fdata,gdata, alternative="two.sided")$p.value
if(pval.wmw<0.05) 1 else 0
}

ptn1.wmw<-replicate(B,tn.wmw(1000,1000,4.7,4.7,1.6,1.6))
est.ptni.wmw<-sum(ptn1.wmw)/B
est.ptni.wmw # wilcox - type 1
## [1] 0.039
binom.test(sum(ptn1.wmw),B,conf.level = 0.95)$conf.int
## [1] 0.02787715 0.05293094
## attr(,"conf.level")
## [1] 0.95
ptn2.wmw<-replicate(B,tn.wmw(1000,1000,4.7,5,1.6,1.3))
est.ptn2.wmw<-sum(ptn2.wmw)/B
est.ptn2.wmw  # wilcox - power
## [1] 1
binom.test(sum(ptn2.wmw),B,conf.level = 0.95)$conf.int
## [1] 0.9963179 1.0000000
## attr(,"conf.level")
## [1] 0.95

Descriptive Statistics

library(pastecs)
## Warning: package 'pastecs' was built under R version 4.1.2
## 
## Attaching package: 'pastecs'
## The following objects are masked from 'package:dplyr':
## 
##     first, last
## The following object is masked from 'package:tidyr':
## 
##     extract
# TSAPRE
SFOgroup$TSAPRE <- as.factor(SFOgroup$TSAPRE)
summary(SFOgroup$TSAPRE)
##     1     2  NA's 
##  4513  6271 12222
tapply(SFOgroup$TSAPRE, SFOgroup$YEAR, function(x) format(summary(x)))
## $`2010`
##      1      2   NA's 
## "   0" "   0" "2808" 
## 
## $`2011`
##      1      2   NA's 
## "   0" "   0" "3354" 
## 
## $`2012`
##      1      2   NA's 
## "   0" "   0" "3004" 
## 
## $`2013`
##      1      2   NA's 
## "   0" "   0" "3056" 
## 
## $`2014`
##      1      2 
## " 779" "1333" 
## 
## $`2015`
##      1      2 
## " 973" "1241" 
## 
## $`2016`
##      1      2 
## " 919" "1373" 
## 
## $`2017`
##      1      2 
## " 906" "1183" 
## 
## $`2018`
##      1      2 
## " 936" "1141"
SFOpost<-SFOgroup[which(SFOgroup$YEAR > 2013),]

counts3<-table(SFOpost$YEAR, SFOpost$TSAPRE)
barplot(counts3, beside=TRUE, main="TSA Pre-Check Usage by Year, 2014-2018")

# PASSTHRU
summary(SFOgroup$PASSTHRU)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.000   4.000   4.000   4.131   5.000   5.000
tapply(SFOgroup$PASSTHRU, SFOgroup$GROUP, function(x) format(summary(x))) #https://stackoverflow.com/questions/9847054/how-to-get-summary-statistics-by-group
## $`1`
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## "1.000" "3.000" "4.000" "4.013" "5.000" "5.000" 
## 
## $`2`
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## "1.000" "4.000" "4.000" "4.153" "5.000" "5.000" 
## 
## $`3`
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## "1.000" "4.000" "5.000" "4.422" "5.000" "5.000"
stat.desc(SFOgroup$PASSTHRU)
##      nbr.val     nbr.null       nbr.na          min          max        range 
## 2.300600e+04 0.000000e+00 0.000000e+00 1.000000e+00 5.000000e+00 4.000000e+00 
##          sum       median         mean      SE.mean CI.mean.0.95          var 
## 9.504700e+04 4.000000e+00 4.131401e+00 6.820962e-03 1.336954e-02 1.070366e+00 
##      std.dev     coef.var 
## 1.034585e+00 2.504199e-01
tapply(SFOgroup$PASSTHRU, SFOgroup$GROUP, function(x) format(stat.desc(x)))
## $`1`
##        nbr.val       nbr.null         nbr.na            min            max 
## "1.222200e+04" "0.000000e+00" "0.000000e+00" "1.000000e+00" "5.000000e+00" 
##          range            sum         median           mean        SE.mean 
## "4.000000e+00" "4.904800e+04" "4.000000e+00" "4.013091e+00" "9.858877e-03" 
##   CI.mean.0.95            var        std.dev       coef.var 
## "1.932496e-02" "1.187947e+00" "1.089930e+00" "2.715936e-01" 
## 
## $`2`
##        nbr.val       nbr.null         nbr.na            min            max 
## "6.271000e+03" "0.000000e+00" "0.000000e+00" "1.000000e+00" "5.000000e+00" 
##          range            sum         median           mean        SE.mean 
## "4.000000e+00" "2.604100e+04" "4.000000e+00" "4.152607e+00" "1.266507e-02" 
##   CI.mean.0.95            var        std.dev       coef.var 
## "2.482788e-02" "1.005894e+00" "1.002943e+00" "2.415212e-01" 
## 
## $`3`
##        nbr.val       nbr.null         nbr.na            min            max 
## "4.513000e+03" "0.000000e+00" "0.000000e+00" "1.000000e+00" "5.000000e+00" 
##          range            sum         median           mean        SE.mean 
## "4.000000e+00" "1.995800e+04" "5.000000e+00" "4.422335e+00" "1.261997e-02" 
##   CI.mean.0.95            var        std.dev       coef.var 
## "2.474132e-02" "7.187563e-01" "8.477950e-01" "1.917075e-01"
counts2<-table(SFOgroup$GROUP, SFOgroup$PASSTHRU)
barplot(counts2, legend = c("Pre-2014","Post-2014,No Precheck","Post-2014,Precheck"),args.legend = list(x ='topleft', bty='n'), beside=TRUE,main="Ease of Passing through Security Ratings")

#RATE SFO
summary(SFOgroup$RATESFO)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   4.000   4.000   3.878   4.000   6.000
tapply(SFOgroup$RATESFO, SFOgroup$GROUP, function(x) format(summary(x))) 
## $`1`
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## "0.000" "4.000" "4.000" "3.866" "4.000" "6.000" 
## 
## $`2`
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## "0.000" "4.000" "4.000" "3.899" "4.000" "6.000" 
## 
## $`3`
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## "0.000" "4.000" "4.000" "3.883" "4.000" "6.000"
#ALL
dim(SFOgroup)
## [1] 23006    60
SFOgroup$YEAR<-as.factor(SFOgroup$YEAR)
SFOgroup$GROUP<-as.factor(SFOgroup$GROUP)
table(SFOgroup$YEAR, SFOgroup$GROUP)
##       
##           1    2    3
##   2010 2808    0    0
##   2011 3354    0    0
##   2012 3004    0    0
##   2013 3056    0    0
##   2014    0 1333  779
##   2015    0 1241  973
##   2016    0 1373  919
##   2017    0 1183  906
##   2018    0 1141  936

Wilcoxon (WMW) Pairwise Comparisons

SFOgroup1<- SFOgroup[which(SFOgroup$GROUP == 1),]
SFOgroup2<- SFOgroup[which(SFOgroup$GROUP == 2),]
SFOgroup3<- SFOgroup[which(SFOgroup$GROUP == 3),]
SFOgroup23<- SFOgroup[which(SFOgroup$GROUP == 2|SFOgroup$GROUP == 3),]
SFOgroup12<- SFOgroup[which(SFOgroup$GROUP == 1|SFOgroup$GROUP == 2),]

a<-SFOgroup1$PASSTHRU # Pre-2014
b<-SFOgroup2$PASSTHRU # Post-2014 no PreCheck
c<-SFOgroup3$PASSTHRU # Post-2014 PreCheck
d<-SFOgroup23$PASSTHRU # All Post-2014
e<-SFOgroup12$PASSTHRU # All no PreCheck
  

# Pre-2014 vs Post-2014 No PreCheck
wmw1<-wilcox.test(a,b, alternative="two.sided")
wmw1
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  a and b
## W = 35815930, p-value = 7.839e-15
## alternative hypothesis: true location shift is not equal to 0
pval.wmw1<-wilcox.test(a,b, alternative="two.sided")$p.value
pval.wmw1
## [1] 7.838684e-15
# Post-2014 No PreCheck vs. Post-2014 PreCheck
wmw2<-wilcox.test(b,c, alternative="two.sided")
wmw2
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  b and c
## W = 12055506, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
pval.wmw2<-wilcox.test(b,c, alternative="two.sided")$p.value
pval.wmw2
## [1] 2.284942e-47
# No PreCheck vs PreCheck
wmw3<-wilcox.test(e,c, alternative="two.sided")
wmw3
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  e and c
## W = 33828901, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
pval.wmw3<-wilcox.test(e,c, alternative="two.sided")$p.value
pval.wmw3
## [1] 2.909615e-100
# Pre-2014 vs Post-2014
wmw4<-wilcox.test(a,d, alternative="two.sided")
wmw4
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  a and d
## W = 57589326, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
pval.wmw4<-wilcox.test(a,d, alternative="two.sided")$p.value
pval.wmw4
## [1] 7.793181e-71

Monte Carlo Pairwise Comparisons

#a<-SFOgroup1$PASSTHRU # Pre-2014
#b<-SFOgroup2$PASSTHRU # Post-2014 no PreCheck
#c<-SFOgroup3$PASSTHRU # Post-2014 PreCheck
#d<-SFOgroup23$PASSTHRU # All Post-2014
#e<-SFOgroup12$PASSTHRU # All no PreCheck


#d2<-p1means-p3means
#mean(dtn2>rrtn2) + mean(dtn2<rrtn3) # difference of mean - power
#binom.test((sum(dtn2>rrtn2)+sum(dtn2<rrtn3)),B,conf.level = 0.95)$conf.int

ma<-mean(a)
mb<-mean(b)
mc<-mean(c)
md<-mean(d)
me<-mean(e)

mab<-ma-mb 
mab # Pre-2014 vs Post-2014 No PreCheck
## [1] -0.1395161
mbc<-mb-mc
mbc # Post-2014 No PreCheck vs. Post-2014 PreCheck
## [1] -0.2697282
mec<-me-mc
mec # No PreCheck vs PreCheck
## [1] -0.3619342
mad<-ma-md
mad # Pre-2014 vs Post-2014
## [1] -0.2523948
# Pre-2014 vs Post-2014 No PreCheck
pless1<-mean(dtn1 <= mab)
pgreater1<-mean(dtn1 >= mab)
(pvalue1 <- 2* min(pless1,pgreater1))
## [1] 0
# Post-2014 No PreCheck vs. Post-2014 PreCheck
pless2<-mean(dtn1 <= mbc)
pgreater2<-mean(dtn1 >= mbc)
(pvalue2 <- 2* min(pless2,pgreater2))
## [1] 0
# No PreCheck vs PreCheck
pless3<-mean(dtn1 <= mec)
pgreater3<-mean(dtn1 >= mec)
(pvalue3 <- 2* min(pless3,pgreater3))
## [1] 0
# Pre-2014 vs Post-2014
pless4<-mean(dtn1 <= mad)
pgreater4<-mean(dtn1 >= mad)
(pvalue4 <- 2* min(pless4,pgreater4))
## [1] 0