R Markdown
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5 v purrr 0.3.4
## v tibble 3.1.2 v dplyr 1.0.6
## v tidyr 1.1.3 v stringr 1.4.0
## v readr 1.4.0 v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(ggplot2)
library(truncnorm)
## Warning: package 'truncnorm' was built under R version 4.1.2
library(extraDistr)
## Warning: package 'extraDistr' was built under R version 4.1.2
##
## Attaching package: 'extraDistr'
## The following object is masked from 'package:purrr':
##
## rdunif
library(Rcpp)
## Warning: package 'Rcpp' was built under R version 4.1.2
#setup
setwd("C:/Users/sweeneys/OneDrive - Michigan Medicine/Desktop/STATS 406/Project")
SFO <- read.csv("SFO Dataset.csv")
#SFOnew<- SFO[which(SFO$RATESFO >0 & SFO$RATESFO <6),]
SFOclean<- SFO[which(SFO$PASSTHRU >0 & SFO$PASSTHRU <6),]
SFOgroup<- SFOclean[which(SFOclean$GROUP > 0),]
SFOgroup1<- SFOgroup[which(SFOgroup$GROUP == 1),]
SFOpre<- SFO[which(SFO$POST == 0),]
SFOpost<- SFO[which(SFO$POST == 1),]
SFOGp1 <- SFOclean[which(SFOclean$GROUP == 1),]
SFOGp2 <- SFOclean[which(SFOclean$GROUP == 2),]
SFOGp3 <- SFOclean[which(SFOclean$GROUP == 3),]
library(ggplot2)
#hist(SFOGp1$PASSTHRU)
#hist(SFOGp2$PASSTHRU)
#hist(SFOGp3$PASSTHRU)
#counts<-table(SFOclean$PASSTHRU)
#barplot(counts)
counts2<-table(SFOgroup$GROUP, SFOgroup$PASSTHRU)
barplot(counts2, legend = rownames(counts2), beside=TRUE)

#ggplot() + geom_histogram(data=SFOclean, aes(x=PASSTHRU, y=..density..), binwidth = 1) + stat_function(fun = dnorm, args = list(mean = 5, sd = 0.9),col = "#1b98e0",size = 2)
#ggplot() + geom_histogram(data=SFOclean, aes(x=PASSTHRU, y=..density..), binwidth = 1) + stat_function(fun = dnorm, args = list(mean = 5, sd = 0.8),col = "#1b98e0",size = 2)
rate<-1:5
datpois<-dtpois(rate,lambda = 5, a=1, b=5)
plot(rate,datpois, type='b')

datpoisl4<-dtpois(rate,lambda = 4, a=1, b=5)
ggplot(data = SFOclean) + geom_histogram(aes(x = PASSTHRU, y = ..density..), binwidth = 1, bins = 5, col = "black", fill = "grey") + geom_point(aes(x=1, y=datpois[1]), color="red") + geom_point(aes(x=2, y=datpois[2]), color="red") + geom_point(aes(x=3, y=datpois[3]), color="red") + geom_point(aes(x=4, y=datpois[4]), color="red") + geom_point(aes(x=5, y=datpois[5]), color="red")

ggplot(data = SFOgroup1) + geom_histogram(aes(x = PASSTHRU, y = ..density..), binwidth = 1, bins = 5, col = "black", fill = "grey") + geom_point(aes(x=1, y=datpois[1]), color="red") + geom_point(aes(x=2, y=datpois[2]), color="red") + geom_point(aes(x=3, y=datpois[3]), color="red") + geom_point(aes(x=4, y=datpois[4]), color="red") + geom_point(aes(x=5, y=datpois[5]), color="red")

#hist(SFO$RATESFO)
boxplot(SFOnew$RATESFO ~ SFOnew$TERM)

boxplot(SFOnew$RATESFO ~ SFOnew$GENDER)

boxplot(SFOnew$RATESFO ~ SFOnew$AGE)

SFOtable<- table(SFOnew$YEAR, SFOnew$RATESFO)
barplot(SFOtable, main="Overall Satisfaction with SFO by year", legend.text=rownames(SFOtable))

#sat.year<-aov (RATESFO ~ YEAR, data=SFOnew)
#summary(sat.year)
#length(SFOnew$POST)
#length(SFOnew$RATESFO)
chisq.test(SFOnew$POST,SFOnew$RATESFO)
chisq.test(SFOpre$YEAR,SFOpre$RATESFO)
chisq.test(SFOpost$YEAR,SFOpost$RATESFO)
SIMULATION DISTRIBUTION - RV CREATION
B=1000
n=length(SFOgroup$GROUP)
p1<-data.frame(replicate(B,rtpois(B,5, a=1, b=5)))
p2<-data.frame(replicate(B,rtpois(B,5, a=1, b=5)))
p3<-data.frame(replicate(B,rtpois(B,4, a=1, b=5)))
#pt1<-data.frame(replicate(B,rt(B,df=1,ncp=5)))
#pt2<-data.frame(replicate(B,rt(B,df=1,ncp=5)))
#pt3<-data.frame(replicate(B,rt(B,df=1,ncp=3)))
ptn1<-data.frame(replicate(B,rtruncnorm(B, a=1, b=5, mean=4.7, sd=1.6)))
ptn2<-data.frame(replicate(B,rtruncnorm(B, a=1, b=5, mean=4.7, sd=1.6)))
ptn3<-data.frame(replicate(B,rtruncnorm(B, a=1, b=5, mean=5, sd=1.3)))
null_samples<-p1
alt_samples<-p3
stat_1<-function(x){
mean(x)
}
ts_0 <-map_dbl(null_samples, stat_1)
rr<-quantile(ts_0,0.95)
mean(ts_0>rr)
## [1] 0.05
ts_1 <-map_dbl(alt_samples,stat_1)
mean(ts_1>rr)
## [1] 0
lambda_power<-seq(1,5,length.out=20)
power_mom <- map_dbl(lambda_power,function(lambda){
altmom<-data.frame(replicate(B, { rpois(B,lambda)}))
ts_mom<-map_dbl(altmom,stat_1)
mean(ts_mom>rr)
})
pp1<-data.frame(lambda_power,power_mom)
ggplot(data=pp1,aes(x=lambda_power,y=power_mom)) + geom_line() + geom_point()

null_samples_tn<-ptn1
alt_samples_tn<-ptn3
stat_tn<-function(x){
mean(x)
}
ts_0_tn <-map_dbl(null_samples_tn, stat_tn)
rr_tn<-quantile(ts_0_tn,0.95)
mean(ts_0_tn>rr_tn)
## [1] 0.05
ts_1_tn <-map_dbl(alt_samples_tn,stat_tn)
mean(ts_1_tn>rr_tn)
## [1] 1
tn_power<-seq(4,5,length.out=20)
power_tn_mean <- map_dbl(tn_power,function(t){
alt<-data.frame(replicate(1000, {rtruncnorm(B, a=1, b=5, mean=t, sd=1.6)}))
ts_tn<-map_dbl(alt,stat_tn)
mean(ts_tn>rr_tn)
})
pptn<-data.frame(tn_power,power_tn_mean)
ggplot(data=pptn,aes(x=tn_power,y=power_tn_mean)) + geom_line() + geom_point()

MONTE CARLO SIMULATIONS
p1means<-map_dbl(p1,mean)
p2means<-map_dbl(p2,mean)
p3means<-map_dbl(p3,mean)
d1<-p1means-p2means
mean(d1)
## [1] -0.002102
quantile(d1,c(0.025,0.975))
## 2.5% 97.5%
## -0.092025 0.089000
#rr1<-quantile(d1,0.95)
#mean(d1>rr1) # difference of mean - type 1 (one-sided)
d2<-p1means-p3means
mean(d2)
## [1] 0.247398
quantile(d2,c(0.025,0.975))
## 2.5% 97.5%
## 0.152975 0.338025
#mean(d2>rr1) # difference of mean - power (one-sided)
rr2<-quantile(d1,0.975)
rr3<-quantile(d1,0.025)
mean(d1>rr2) + mean(d1<rr3)# difference of mean - type 1
## [1] 0.049
binom.test((sum(d1>rr2)+sum(d1<rr3)),B,conf.level = 0.95)$conf.int
## [1] 0.03646706 0.06426605
## attr(,"conf.level")
## [1] 0.95
mean(d2>rr2) + mean(d2<rr3) # difference of mean - power
## [1] 0.999
binom.test((sum(d2>rr2)+sum(d2<rr3)),B,conf.level = 0.95)$conf.int
## [1] 0.9944411 0.9999747
## attr(,"conf.level")
## [1] 0.95
pt.wmw<-function(n,m,lambda1, lambda2){
fdata<-rtpois(n,lambda1, a=1, b=5)
gdata<-rtpois(m,lambda2, a=1, b=5)
pval.wmw<-wilcox.test(fdata,gdata, alternative="two.sided")$p.value
if(pval.wmw<0.05) 1 else 0
}
pt1.wmw<-replicate(B,pt.wmw(1000,1000,5,5))
est.pti.wmw<-sum(pt1.wmw)/B
est.pti.wmw # wilcox - type 1
## [1] 0.047
binom.test(sum(pt1.wmw),B,conf.level = 0.95)$conf.int
## [1] 0.03473506 0.06201263
## attr(,"conf.level")
## [1] 0.95
pt2.wmw<-replicate(B,pt.wmw(1000,1000,5,4))
est.pt2.wmw<-sum(pt2.wmw)/B
est.pt2.wmw # wilcox - power
## [1] 0.999
binom.test(sum(pt2.wmw),B,conf.level = 0.95)$conf.int
## [1] 0.9944411 0.9999747
## attr(,"conf.level")
## [1] 0.95
#TRUNCATED NORMAL
ptn1means<-map_dbl(ptn1,mean)
ptn2means<-map_dbl(ptn2,mean)
ptn3means<-map_dbl(ptn3,mean)
dtn1<-ptn1means-ptn2means
mean(dtn1)
## [1] -0.0004584533
quantile(dtn1,c(0.025,0.975))
## 2.5% 97.5%
## -0.08351339 0.07877126
#rrtn1<-quantile(dtn1,0.95)
#rrtn1
#mean(dtn1>rrtn1) # difference of mean - type 1 (one-sided)
dtn2<-ptn1means-ptn3means
mean(dtn2)
## [1] -0.3035379
quantile(dtn2,c(0.025,0.975))
## 2.5% 97.5%
## -0.3776802 -0.2298890
#mean(dtn2>rrtn1) # difference of mean - power (one-sided)
rrtn2<-quantile(dtn1,0.975)
rrtn3<-quantile(dtn1,0.025)
mean(dtn1>rrtn2) + mean(dtn1<rrtn3)# difference of mean - type 1
## [1] 0.05
binom.test((sum(dtn1>rrtn2)+sum(dtn1<rrtn3)),B,conf.level = 0.95)$conf.int
## [1] 0.03733540 0.06539049
## attr(,"conf.level")
## [1] 0.95
mean(dtn2>rrtn2) + mean(dtn2<rrtn3) # difference of mean - power
## [1] 1
binom.test((sum(dtn2>rrtn2)+sum(dtn2<rrtn3)),B,conf.level = 0.95)$conf.int
## [1] 0.9963179 1.0000000
## attr(,"conf.level")
## [1] 0.95
#rtruncnorm(B, a=1, b=5, mean=4.7, sd=1.6)
tn.wmw<-function(n,m,mu1,mu2,sd1,sd2){
fdata<-rtruncnorm(n,a=1, b=5,mean = mu1, sd=sd1)
gdata<-rtruncnorm(m,a=1, b=5,mean = mu2, sd=sd2)
pval.wmw<-wilcox.test(fdata,gdata, alternative="two.sided")$p.value
if(pval.wmw<0.05) 1 else 0
}
ptn1.wmw<-replicate(B,tn.wmw(1000,1000,4.7,4.7,1.6,1.6))
est.ptni.wmw<-sum(ptn1.wmw)/B
est.ptni.wmw # wilcox - type 1
## [1] 0.049
binom.test(sum(ptn1.wmw),B,conf.level = 0.95)$conf.int
## [1] 0.03646706 0.06426605
## attr(,"conf.level")
## [1] 0.95
ptn2.wmw<-replicate(B,tn.wmw(1000,1000,4.7,5,1.6,1.3))
est.ptn2.wmw<-sum(ptn2.wmw)/B
est.ptn2.wmw # wilcox - power
## [1] 1
binom.test(sum(ptn2.wmw),B,conf.level = 0.95)$conf.int
## [1] 0.9963179 1.0000000
## attr(,"conf.level")
## [1] 0.95
DESCRIPTIVE STATISTICS
library(pastecs)
## Warning: package 'pastecs' was built under R version 4.1.2
##
## Attaching package: 'pastecs'
## The following objects are masked from 'package:dplyr':
##
## first, last
## The following object is masked from 'package:tidyr':
##
## extract
# TSAPRE
SFOgroup$TSAPRE <- as.factor(SFOgroup$TSAPRE)
summary(SFOgroup$TSAPRE)
## 1 2 NA's
## 4513 6271 12222
tapply(SFOgroup$TSAPRE, SFOgroup$YEAR, function(x) format(summary(x)))
## $`2010`
## 1 2 NA's
## " 0" " 0" "2808"
##
## $`2011`
## 1 2 NA's
## " 0" " 0" "3354"
##
## $`2012`
## 1 2 NA's
## " 0" " 0" "3004"
##
## $`2013`
## 1 2 NA's
## " 0" " 0" "3056"
##
## $`2014`
## 1 2
## " 779" "1333"
##
## $`2015`
## 1 2
## " 973" "1241"
##
## $`2016`
## 1 2
## " 919" "1373"
##
## $`2017`
## 1 2
## " 906" "1183"
##
## $`2018`
## 1 2
## " 936" "1141"
SFOpost<-SFOgroup[which(SFOgroup$YEAR > 2013),]
counts3<-table(SFOpost$YEAR, SFOpost$TSAPRE)
barplot(counts3, beside=TRUE, main="TSA Pre-Check Usage by Year, 2014-2018")

# PASSTHRU
summary(SFOgroup$PASSTHRU)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 4.000 4.000 4.131 5.000 5.000
tapply(SFOgroup$PASSTHRU, SFOgroup$GROUP, function(x) format(summary(x))) #https://stackoverflow.com/questions/9847054/how-to-get-summary-statistics-by-group
## $`1`
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## "1.000" "3.000" "4.000" "4.013" "5.000" "5.000"
##
## $`2`
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## "1.000" "4.000" "4.000" "4.153" "5.000" "5.000"
##
## $`3`
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## "1.000" "4.000" "5.000" "4.422" "5.000" "5.000"
stat.desc(SFOgroup$PASSTHRU)
## nbr.val nbr.null nbr.na min max range
## 2.300600e+04 0.000000e+00 0.000000e+00 1.000000e+00 5.000000e+00 4.000000e+00
## sum median mean SE.mean CI.mean.0.95 var
## 9.504700e+04 4.000000e+00 4.131401e+00 6.820962e-03 1.336954e-02 1.070366e+00
## std.dev coef.var
## 1.034585e+00 2.504199e-01
tapply(SFOgroup$PASSTHRU, SFOgroup$GROUP, function(x) format(stat.desc(x)))
## $`1`
## nbr.val nbr.null nbr.na min max
## "1.222200e+04" "0.000000e+00" "0.000000e+00" "1.000000e+00" "5.000000e+00"
## range sum median mean SE.mean
## "4.000000e+00" "4.904800e+04" "4.000000e+00" "4.013091e+00" "9.858877e-03"
## CI.mean.0.95 var std.dev coef.var
## "1.932496e-02" "1.187947e+00" "1.089930e+00" "2.715936e-01"
##
## $`2`
## nbr.val nbr.null nbr.na min max
## "6.271000e+03" "0.000000e+00" "0.000000e+00" "1.000000e+00" "5.000000e+00"
## range sum median mean SE.mean
## "4.000000e+00" "2.604100e+04" "4.000000e+00" "4.152607e+00" "1.266507e-02"
## CI.mean.0.95 var std.dev coef.var
## "2.482788e-02" "1.005894e+00" "1.002943e+00" "2.415212e-01"
##
## $`3`
## nbr.val nbr.null nbr.na min max
## "4.513000e+03" "0.000000e+00" "0.000000e+00" "1.000000e+00" "5.000000e+00"
## range sum median mean SE.mean
## "4.000000e+00" "1.995800e+04" "5.000000e+00" "4.422335e+00" "1.261997e-02"
## CI.mean.0.95 var std.dev coef.var
## "2.474132e-02" "7.187563e-01" "8.477950e-01" "1.917075e-01"
counts2<-table(SFOgroup$GROUP, SFOgroup$PASSTHRU)
barplot(counts2, legend = c("Pre-2014","Post-2014,No Precheck","Post-2014,Precheck"),args.legend = list(x ='topleft', bty='n'), beside=TRUE,main="Ease of Passing through Security Ratings")

#RATE SFO
summary(SFOgroup$RATESFO)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 4.000 4.000 3.878 4.000 6.000
tapply(SFOgroup$RATESFO, SFOgroup$GROUP, function(x) format(summary(x)))
## $`1`
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## "0.000" "4.000" "4.000" "3.866" "4.000" "6.000"
##
## $`2`
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## "0.000" "4.000" "4.000" "3.899" "4.000" "6.000"
##
## $`3`
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## "0.000" "4.000" "4.000" "3.883" "4.000" "6.000"
#ALL
dim(SFOgroup)
## [1] 23006 60
SFOgroup$YEAR<-as.factor(SFOgroup$YEAR)
SFOgroup$GROUP<-as.factor(SFOgroup$GROUP)
table(SFOgroup$YEAR, SFOgroup$GROUP)
##
## 1 2 3
## 2010 2808 0 0
## 2011 3354 0 0
## 2012 3004 0 0
## 2013 3056 0 0
## 2014 0 1333 779
## 2015 0 1241 973
## 2016 0 1373 919
## 2017 0 1183 906
## 2018 0 1141 936
WILCOXON PAIRWISE COMPARISONS
SFOgroup1<- SFOgroup[which(SFOgroup$GROUP == 1),]
SFOgroup2<- SFOgroup[which(SFOgroup$GROUP == 2),]
SFOgroup3<- SFOgroup[which(SFOgroup$GROUP == 3),]
SFOgroup23<- SFOgroup[which(SFOgroup$GROUP == 2|SFOgroup$GROUP == 3),]
SFOgroup12<- SFOgroup[which(SFOgroup$GROUP == 1|SFOgroup$GROUP == 2),]
a<-SFOgroup1$PASSTHRU
b<-SFOgroup2$PASSTHRU
c<-SFOgroup3$PASSTHRU
d<-SFOgroup23$PASSTHRU
e<-SFOgroup12$PASSTHRU
# Pre-2014 vs Post-2014 No PreCheck
wmw1<-wilcox.test(a,b, alternative="two.sided")
wmw1
##
## Wilcoxon rank sum test with continuity correction
##
## data: a and b
## W = 33256657, p-value = 2.811e-14
## alternative hypothesis: true location shift is not equal to 0
pval.wmw1<-wilcox.test(a,b, alternative="two.sided")$p.value
pval.wmw1
## [1] 2.811189e-14
# Post-2014 No PreCheck vs. Post-2014 PreCheck
wmw2<-wilcox.test(b,c, alternative="two.sided")
wmw2
##
## Wilcoxon rank sum test with continuity correction
##
## data: b and c
## W = 11077248, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
pval.wmw2<-wilcox.test(b,c, alternative="two.sided")$p.value
pval.wmw2
## [1] 7.836384e-47
# No PreCheck vs PreCheck
wmw3<-wilcox.test(e,c, alternative="two.sided")
wmw3
##
## Wilcoxon rank sum test with continuity correction
##
## data: e and c
## W = 30972227, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
pval.wmw3<-wilcox.test(e,c, alternative="two.sided")$p.value
pval.wmw3
## [1] 6.251409e-98
# Pre-2014 vs Post-2014
wmw4<-wilcox.test(a,d, alternative="two.sided")
wmw4
##
## Wilcoxon rank sum test with continuity correction
##
## data: a and d
## W = 53151636, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
pval.wmw4<-wilcox.test(a,d, alternative="two.sided")$p.value
pval.wmw4
## [1] 1.945769e-68