data <- d2 %>%
select(-c("force2", "pct")) %>%
mutate_at(vars(all_of(yes_no_columns)), ~ifelse(. == "N", 0, 1))%>%
mutate(race2 = case_when(
race2 == "white" ~ 0,
race2 == "black" ~ 1,
race2 == "hisp" ~ 2,
race2 == "asian" ~ 3,
race2 == "other" ~ 4
)) %>%
mutate(typeofid2 = case_when(
typeofid2 == "O" ~ 0,
typeofid2 == "P" ~ 1,
typeofid2 == "R" ~ 2,
typeofid2 == "V" ~ 3,
)) %>%
mutate(across(all_of(factor_columns), as.factor)) %>%
mutate(year = as.numeric(year))
summary(data)
## force race2 gender age2 daytime
## 0 :3910239 0 : 492688 0 :4495436 Min. :10 0 :2472074
## 1 : 736051 1 :2886843 1 : 348159 1st Qu.:19 1 :1871287
## 3 : 156732 2 :1215401 NA's: 140796 Median :24 NA's: 641030
## 2 : 122898 3 : 152053 Mean :28
## 5 : 31590 4 : 235105 3rd Qu.:34
## 6 : 17769 NA's: 2301 Max. :90
## (Other): 9112 NA's :41534
## inout2 ac_incid ac_time offunif2 typeofid2
## 0 :3809789 0:2204888 0:3158281 0 :1396968 0 : 84217
## 1 :1147150 1:2779503 1:1826110 1 :3586909 1 :2640092
## NA's: 27452 NA's: 514 2 : 107497
## 3 :2133368
## NA's: 19217
##
##
## othpers2 cs_objcs cs_descr cs_casng cs_lkout cs_cloth
## 0 :3806808 0:4850690 0:4116587 0:3562617 0:4151151 0:4773826
## 1 :1163298 1: 133701 1: 867804 1:1421774 1: 833240 1: 210565
## NA's: 14285
##
##
##
##
## cs_drgtr cs_furtv cs_vcrim cs_bulge cs_other year
## 0:4522192 0:2801105 0:4588325 0:4543022 0:3944522 Min. :2003
## 1: 462199 1:2183286 1: 396066 1: 441369 1:1039869 1st Qu.:2006
## Median :2009
## Mean :2008
## 3rd Qu.:2011
## Max. :2013
##
Histograms of all the covariates except age and year which are seen below.
We make a Spearman correlation for all the covariates except for the two we discarded.
cp <- cor(data.matrix(na.omit(data)), method = "spearman")
ord <- rev(hclust(as.dist(1-abs(cp)))$order)
colPal <- colorRampPalette(c("white", "darkseagreen4"), space = "rgb")(100)
spearman <- levelplot(cp[ord, ord], xlab = "", ylab = "",
col.regions = colPal, at = seq(-1, 1, length.out = 100),
colorkey = list(space = "top", labels = list(cex = 0.5)),
scales = list(x = list(rot = 45),
y = list(rot = 45),
cex = 0.5))
spearman
We now define a new data set where force id 0 if there is no force, and 1 if there is any kind of force. This is because we later want to fit a binomial model.
table(data$force, data$year)
##
## 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012
## 0 119564 246398 312773 406541 365744 414700 439271 463337 540799 443857
## 1 21225 41392 56400 69177 75406 91045 103625 97624 102471 57803
## 2 8134 10412 12542 653 13097 14096 16467 16444 16978 10099
## 3 7938 10637 11812 11366 14079 16265 17690 19265 21076 17697
## 4 912 1010 937 631 612 496 481 498 641 712
## 5 1166 1579 1684 16069 1369 1792 1831 2128 1845 1418
## 6 1787 1954 1808 1868 1592 1747 1609 1807 1730 1215
## 7 125 141 235 184 197 161 194 182 184 110
##
## 2013
## 0 157255
## 1 19883
## 2 3976
## 3 8907
## 4 435
## 5 709
## 6 652
## 7 34
Not many force 4-observations. maybe it’s not a big difference for the police to go from “4: at least drawing a weapon on a civilian.” to “5: at least pushing a civilian to the ground.” Which gives reason for the choice of joining some “forces” and fitting a binomial model.
How can we replace the missing values? Maybe we should do this only for the columns we are interested in and not all of them.
Diagnostistke NA-plot på hele datasættet??
##
## Attaching package: 'mice'
## The following object is masked from 'package:stats':
##
## filter
## The following objects are masked from 'package:base':
##
## cbind, rbind
## Loading required package: colorspace
## The legacy packages maptools, rgdal, and rgeos, underpinning the sp package,
## which was just loaded, were retired in October 2023.
## Please refer to R-spatial evolution reports for details, especially
## https://r-spatial.org/r/2023/05/15/evolution4.html.
## It may be desirable to make the sf package available;
## package maintainers should consider adding sf to Suggests:.
## VIM is ready to use.
## Suggestions and bug-reports can be submitted at: https://github.com/statistikat/VIM/issues
##
## Attaching package: 'VIM'
## The following object is masked from 'package:datasets':
##
## sleep
##
## Attaching package: 'ggmice'
## The following objects are masked from 'package:mice':
##
## bwplot, densityplot, stripplot, xyplot
## The following objects are masked from 'package:lattice':
##
## bwplot, densityplot, stripplot, xyplot
NAplot <- md.pattern(modeldata,
rotate.names = TRUE)
pMiss <- function(x){sum(is.na(x))/length(x)*100}
apply(data,2,pMiss)
## force race2 gender age2 daytime inout2
## 0.00000000 0.04616412 2.82473827 0.83328134 12.86074869 0.55075936
## ac_incid ac_time offunif2 typeofid2 othpers2 cs_objcs
## 0.00000000 0.00000000 0.01031219 0.38554359 0.28659469 0.00000000
## cs_descr cs_casng cs_lkout cs_cloth cs_drgtr cs_furtv
## 0.00000000 0.00000000 0.00000000 0.00000000 0.00000000 0.00000000
## cs_vcrim cs_bulge cs_other year
## 0.00000000 0.00000000 0.00000000 0.00000000
#apply(modeldata,1,pMiss)
aggr_plot <- aggr(data, col=c('cadetblue','darkseagreen2'),
numbers=TRUE,
sortVars=TRUE,
labels=names(modeldata),
cex.axis=.7, gap=3,
ylab=c("Histogram of missing data","Pattern"))
##
## Variables sorted by number of missings:
## Variable Count
## ac_incid 0.1286074869
## daytime 0.0282473827
## inout2 0.0083328134
## offunif2 0.0055075936
## age2 0.0038554359
## daytime 0.0028659469
## age2 0.0004616412
## force 0.0001031219
## force 0.0000000000
## cs_furtv 0.0000000000
## cs_lkout 0.0000000000
## inout2 0.0000000000
## ac_incid 0.0000000000
## offunif2 0.0000000000
## cs_furtv 0.0000000000
## cs_lkout 0.0000000000
## force 0.0000000000
## age2 0.0000000000
## daytime 0.0000000000
## inout2 0.0000000000
## ac_incid 0.0000000000
## offunif2 0.0000000000
We dont see a strong pattern of how the values are missing.
12% of data is missing “daytime”. We investigate how the data is missing:
We suspect some correlation between daytime an ac_time stratified on force.
not a strong correlation, but this may indicate, that the values are missing at random
Cross tabulation:
cross_tab <- table(data$year, data$daytime, useNA = "ifany")
row_sums <- rowSums(cross_tab, na.rm = TRUE)
cross_tab_with_sums <- cbind(cross_tab, row_sums)
print(cross_tab_with_sums)
## 0 1 <NA> row_sums
## 2003 100148 60686 17 160851
## 2004 201078 112376 69 313523
## 2005 250292 147348 551 398191
## 2006 243565 185416 77508 506489
## 2007 232487 162968 76641 472096
## 2008 261735 191513 87054 540302
## 2009 266595 228727 85846 581168
## 2010 278109 225644 97532 601285
## 2011 307707 275828 102189 685724
## 2012 242830 208138 81943 532911
## 2013 87528 72643 31680 191851
From the cross tabulation we see that in year 2003, 2004 and 2005 there are not many missing values ratio between day and night for these years are quite similar to the ratio in the following years when not taking the NA’s into account. This indicates once more, that the values are missing at random.
modeldata_imp <- mice(modeldata,m=5,maxit=50,meth='pmm',seed=500)
summary(modeldata_imp)
#watch imputed data
modeldata_imp$imp$daytime
#puts imputed data from 1st bootstrap back into dataset
modeldata_mice1 <- complete(modeldata_imp, 1)
modeldata_mice2 <- complete(modeldata_imp, 2)
modeldata_mice3 <- complete(modeldata_imp, 3)
modeldata_mice4 <- complete(modeldata_imp, 4)
modeldata_mice5 <- complete(modeldata_imp, 5)
Histogram of imputed data
df_orig <- data.frame(mice1$daytime, mice2$daytime, mice3$daytime, mice4$daytime, mice5$daytime, modeldata_orig$daytime)
df_orig_long <- df_orig %>%
pivot_longer(everything(), names_to = "Dataset", values_to = "BinaryValue")
# Create a histogram with
p_orig <- ggplot(df_orig_long, aes(x = BinaryValue, fill = Dataset)) +
geom_bar(position = "dodge") +
scale_fill_manual(values = c("mice1.daytime" = "darkseagreen", "mice2.daytime" = "darkseagreen1", "mice3.daytime" = "darkseagreen4", "mice4.daytime" = "darkseagreen3", "mice5.daytime" = "darkseagreen2", "modeldata.daytime" = "cadetblue3")) +
labs(fill = "Dataset") +
theme(legend.position = "top", text = element_text(size =7))
df_mice <- data.frame(mice1$daytime, mice2$daytime, mice3$daytime, mice4$daytime, mice5$daytime)
df_mice_long <- df_mice %>%
pivot_longer(everything(), names_to = "Dataset", values_to = "BinaryValue")
# Create a histogram without original in comparison
p_mice <- ggplot(df_mice_long, aes(x = BinaryValue, fill = Dataset)) +
geom_bar(position = "dodge") +
scale_fill_manual(values = c("mice1.daytime" = "darkseagreen", "mice2.daytime" = "darkseagreen1", "mice3.daytime" = "darkseagreen4", "mice4.daytime" = "darkseagreen3", "mice5.daytime" = "darkseagreen2")) +
labs(fill = "Dataset") +
theme(legend.position = "top", text = element_text(size =7))
grid.arrange(p_orig, p_mice, ncol = 2)
## Warning: Removed 641030 rows containing non-finite values (`stat_count()`).
Remember to import data!
Estimate reconcilation
mice1$dataset <- "Dataset 1"
mice2$dataset <- "Dataset 2"
mice3$dataset <- "Dataset 3"
mice4$dataset <- "Dataset 4"
mice5$dataset <- "Dataset 5"
combined_data <- rbind(mice1, mice2, mice3, mice4, mice5)
models <- split(combined_data, combined_data$dataset) %>%
map(~ glm(bin_force ~ age2 + daytime + inout2 + ac_incid + offunif2 + cs_furtv + cs_lkout + year, data = ., family = binomial(link=logit)))
plot_data <- map_dfr(models, broom::tidy, .id = "dataset")
ggplot(plot_data, aes(x = term, y = estimate, color = dataset)) +
geom_point() +
geom_errorbar(aes(ymin = estimate - std.error, ymax = estimate + std.error), width = 0.2) +
theme_minimal() + theme(axis.text.x = element_text(angle = 60, vjust = 1, hjust=1))
Pooling the imputed data sets
Rubin’s Rules consist of three steps: a. Combine the Estimates: - Calculate the mean of the estimates from each dataset to obtain a pooled estimate. b. Within-Imputation Variance: - Calculate the average of the variances of the estimates from each dataset. c. Between-Imputation Variance: - Calculate the variance of the estimates across the datasets.
The total variance of the pooled estimate is the sum of the within-imputation variance and the between-imputation variance. This total variance is used to calculate confidence intervals and perform hypothesis tests.
fits <- lapply(list(mice1, mice2, mice3, mice4, mice5), function(data) {
glm(bin_force ~ age2 + daytime + inout2 + ac_incid + offunif2 + cs_furtv + cs_lkout + year,
family = binomial(link = logit), data = data) })
pooled_results <- pool(fits)
summary(pooled_results)
## term estimate std.error statistic df p.value
## 1 (Intercept) 76.505640557 1.4719473039 51.975801 435.62782 7.205414e-189
## 2 age2 -0.002136738 0.0010921342 -1.956479 199.51943 5.180425e-02
## 3 daytime1 -0.105629747 0.0047350305 -22.308145 32.70059 2.345945e-21
## 4 inout21 -0.305131859 0.0049463252 -61.688596 1097.65590 0.000000e+00
## 5 ac_incid1 -0.172769692 0.0037836336 -45.662374 337896.36350 0.000000e+00
## 6 offunif21 -0.327961710 0.0040851215 -80.281998 2578.72688 0.000000e+00
## 7 cs_furtv1 0.424573651 0.0039458263 107.600696 975.63208 0.000000e+00
## 8 cs_lkout1 -0.242327615 0.0051538063 -47.019155 183889.57772 0.000000e+00
## 9 year -0.038526211 0.0007332462 -52.541982 436.69020 6.737645e-191
fits <- lapply(list(mice1, mice2, mice3, mice4, mice5), function(data) {
glm(bin_force ~ age2 + daytime + inout2 + ac_incid + offunif2 + cs_furtv + cs_lkout,
family = binomial(link = logit), data = data) })
pooled_results <- pool(fits)
summary(pooled_results)
## term estimate std.error statistic df p.value
## 1 (Intercept) -0.82443676 0.020609885 -40.002006 123.84117 1.131550e-72
## 2 age2 -0.00207628 0.001087369 -1.909452 216.44996 5.752608e-02
## 3 daytime1 -0.12324138 0.004574285 -26.942219 41.82147 4.713664e-28
## 4 inout21 -0.30387964 0.004937604 -61.543941 1162.36248 0.000000e+00
## 5 ac_incid1 -0.18612627 0.003773149 -49.329160 266548.88025 0.000000e+00
## 6 offunif21 -0.34171987 0.004060010 -84.167249 3667.77577 0.000000e+00
## 7 cs_furtv1 0.38695123 0.003896823 99.299152 714.54784 0.000000e+00
## 8 cs_lkout1 -0.25230130 0.005145107 -49.037132 191471.46264 0.000000e+00
est1 <- summary(fits[[1]])$coefficients[,1]
est2 <- summary(fits[[2]])$coefficients[,1]
est3 <- summary(fits[[3]])$coefficients[,1]
est4 <- summary(fits[[4]])$coefficients[,1]
est5 <- summary(fits[[5]])$coefficients[,1]
pool <- summary(pooled_results)$estimate
cbind(est1,est2,est3,est4,est5,pool)
## est1 est2 est3 est4 est5
## (Intercept) -0.818488383 -0.835824062 -0.824162938 -0.815690404 -0.828018006
## age2 -0.002420985 -0.001548769 -0.002115183 -0.002400004 -0.001896457
## daytime1 -0.125052486 -0.120756028 -0.123369800 -0.125958404 -0.121070188
## inout21 -0.303442907 -0.303096167 -0.303209032 -0.305753382 -0.303896698
## ac_incid1 -0.186238958 -0.185765052 -0.186155166 -0.186232445 -0.186239707
## offunif21 -0.342267140 -0.341523545 -0.340664646 -0.342309572 -0.341834428
## cs_furtv1 0.388029540 0.387548085 0.387058893 0.385482562 0.386637051
## cs_lkout1 -0.252237630 -0.252189893 -0.252105262 -0.252129678 -0.252844024
## pool
## (Intercept) -0.82443676
## age2 -0.00207628
## daytime1 -0.12324138
## inout21 -0.30387964
## ac_incid1 -0.18612627
## offunif21 -0.34171987
## cs_furtv1 0.38695123
## cs_lkout1 -0.25230130
coefficients_matrix <- matrix(c(est1, est2, est3, est4, est5), 8, 5)
median_coefficients <- apply(coefficients_matrix, 1, median)
median_index <- which.min(median_coefficients)
median_data_set <- paste0("est", median_index)
cat("The data set with median coefficients is:", median_data_set, "\n")
## The data set with median coefficients is: est1
add_model <- glm(bin_force ~ age2 + daytime + inout2 + ac_incid + offunif2 + cs_furtv + cs_lkout,
family = binomial(link = logit), data = mice1)
drop1(add_model, test = "Chisq")
## Single term deletions
##
## Model:
## bin_force ~ age2 + daytime + inout2 + ac_incid + offunif2 + cs_furtv +
## cs_lkout
## Df Deviance AIC LRT Pr(>Chi)
## <none> 1764236 1764252
## age2 1 1764242 1764256 5.7 0.01661 *
## daytime 1 1765323 1765337 1086.8 < 2e-16 ***
## inout2 1 1768390 1768404 4153.6 < 2e-16 ***
## ac_incid 1 1766678 1766692 2441.4 < 2e-16 ***
## offunif2 1 1771465 1771479 7228.5 < 2e-16 ***
## cs_furtv 1 1774991 1775005 10755.0 < 2e-16 ***
## cs_lkout 1 1766726 1766740 2489.9 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
trainErr <- function(mod) {
mean(residuals(mod, type = "deviance") ^ 2)
}
form0 <- bin_force ~ age2 + daytime + inout2 + ac_incid + offunif2 + cs_lkout + cs_furtv
form1 <- bin_force ~ age2*daytime + inout2 + ac_incid + offunif2 + cs_lkout + cs_furtv
form2 <- bin_force ~ age2 + daytime*cs_lkout + inout2 + ac_incid + offunif2 + cs_furtv
form3 <- bin_force ~ age2 + daytime*cs_furtv + inout2 + ac_incid + offunif2 + cs_lkout
form4 <- bin_force ~ inout2*age2 + daytime + ac_incid + offunif2 + cs_lkout + cs_furtv
form5 <- bin_force ~ age2 + daytime + ac_incid + offunif2 + inout2*cs_lkout + cs_furtv
form6 <- bin_force ~ age2 + daytime + ac_incid + offunif2 + cs_lkout + inout2*cs_furtv
form7 <- bin_force ~ age2*ac_incid + daytime + inout2 + offunif2 + cs_lkout + cs_furtv
form8 <- bin_force ~ age2 + daytime + inout2 + offunif2 + cs_lkout*ac_incid + cs_furtv
form9 <- bin_force ~ age2 + daytime + inout2 + offunif2 + cs_lkout + cs_furtv*ac_incid
form10 <- bin_force ~ age2*offunif2 + daytime + inout2 + ac_incid + cs_lkout + cs_furtv
form11 <- bin_force ~ age2 + daytime + inout2 + ac_incid + cs_lkout*offunif2 + cs_furtv
form12 <- bin_force ~ age2 + daytime + inout2 + ac_incid + cs_lkout + cs_furtv*offunif2
form13 <- bin_force ~ age2 + daytime + inout2 + ac_incid + cs_lkout*cs_furtv*offunif2
form14 <- bin_force ~ age2 + daytime + inout2 + cs_furtv*offunif2*ac_incid + cs_lkout
form15 <- bin_force ~ age2 + daytime + cs_furtv*offunif2*inout2 + ac_incid + cs_lkout
form16 <- bin_force ~ age2 + cs_furtv*offunif2*daytime + inout2 + ac_incid + cs_lkout
form17 <- bin_force ~ cs_furtv*offunif2*age2 + daytime + inout2 + ac_incid + cs_lkout
form18 <- bin_force ~ age2*(cs_furtv + cs_lkout)*(daytime + inout2 + ac_incid + offunif2)
form19 <- bin_force ~ ns(age2, df = 2)*(cs_furtv + cs_lkout)*(daytime + inout2 + ac_incid + offunif2)
sapply(list(form0, form1, form2, form3, form4, form5, form6, form7, form8, form9, form10, form11, form12, form13, form14, form15, form16, form17, form18, form19), function(form){trainErr(glm(form, data = mice1, family = binomial(link = "logit")))})
## [1] 1.077205 1.077204 1.077203 1.077165 1.077194 1.077006 1.077101 1.077193
## [9] 1.076873 1.076300 1.077205 1.077127 1.077171 1.076194 1.076083 1.076182
## [17] 1.077116 1.077131 1.075485 1.075439
cv_data <- mice1 %>%
mutate(bin_force = as.numeric(as.character(bin_force)))
testCV <- function(form, data, B = 1, k = 5) {
n <- nrow(data)
y <- data$bin_force
PEcv <- vector("list", B)
tmp <- numeric(n)
for(b in 1:B) {
group <- sample(rep(1:k, length.out = n))
for(i in 1:k) {
modelcv <- glm(form, family=binomial("logit"), data = data[group != i, ]) # should be changed
muhat <- predict(modelcv, newdata = data[group == i, ], type = "response") # "reponse" takes the inverse of the link
y_cv <- y[group == i]
tmp[group == i] <- -(2*(y_cv*log(muhat)+(1-y_cv)*log(1-muhat)))
}
PEcv[[b]] <- tmp
}
mean(unlist(PEcv))
}
sapply(list(form0,form1, form2, form3, form4, form5, form6, form7, form8, form9, form10, form11, form12, form13, form14, form15, form16, form17, form18, form19), function(form){testCV(form, cv_data)})
## [1] 1.077219 1.077216 1.077214 1.077183 1.077204 1.077013 1.077119 1.077210
## [9] 1.076889 1.076311 1.077213 1.077143 1.077187 1.076211 1.076100 1.076198
## [17] 1.077128 1.077149 1.075538 1.075492
int_model <- glm(bin_force ~ ns(age2, df = 2)*(cs_furtv + cs_lkout)*(daytime + inout2 + ac_incid + offunif2),
family = binomial(link = logit), data = mice1)
int_model_no_spl <- glm(bin_force ~ age2*(cs_furtv + cs_lkout)*(daytime + inout2 + ac_incid + offunif2),
family = binomial(link = logit), data = mice1)
anova(int_model_no_spl, int_model, test = "Chisq")
## Analysis of Deviance Table
##
## Model 1: bin_force ~ age2 * (cs_furtv + cs_lkout) * (daytime + inout2 +
## ac_incid + offunif2)
## Model 2: bin_force ~ ns(age2, df = 2) * (cs_furtv + cs_lkout) * (daytime +
## inout2 + ac_incid + offunif2)
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 1637761 1761420
## 2 1637746 1761344 15 76.08 3.611e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
int_model <- glm(bin_force ~ ns(age2, df = 2)*(cs_furtv + cs_lkout)*(daytime + inout2 + ac_incid + offunif2),
family = binomial(link = logit), data = mice1)
summary(int_model)
##
## Call:
## glm(formula = bin_force ~ ns(age2, df = 2) * (cs_furtv + cs_lkout) *
## (daytime + inout2 + ac_incid + offunif2), family = binomial(link = logit),
## data = mice1)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.787338 0.068099 -11.562 < 2e-16 ***
## ns(age2, df = 2)1 -0.023967 0.133242 -0.180 0.857249
## ns(age2, df = 2)2 -0.038529 0.022103 -1.743 0.081307 .
## cs_furtv1 0.189492 0.096130 1.971 0.048702 *
## cs_lkout1 -0.250573 0.144289 -1.737 0.082457 .
## daytime1 -0.126301 0.055835 -2.262 0.023696 *
## inout21 -0.370558 0.066126 -5.604 2.10e-08 ***
## ac_incid1 -0.535294 0.057190 -9.360 < 2e-16 ***
## offunif21 -0.291377 0.063500 -4.589 4.46e-06 ***
## ns(age2, df = 2)1:cs_furtv1 0.154328 0.187419 0.823 0.410257
## ns(age2, df = 2)2:cs_furtv1 0.050610 0.030288 1.671 0.094729 .
## ns(age2, df = 2)1:cs_lkout1 -0.266411 0.281059 -0.948 0.343190
## ns(age2, df = 2)2:cs_lkout1 0.001139 0.044856 0.025 0.979746
## ns(age2, df = 2)1:daytime1 0.058915 0.109676 0.537 0.591150
## ns(age2, df = 2)2:daytime1 -0.006084 0.018817 -0.323 0.746439
## ns(age2, df = 2)1:inout21 -0.030500 0.130124 -0.234 0.814680
## ns(age2, df = 2)2:inout21 -0.070912 0.022170 -3.199 0.001381 **
## ns(age2, df = 2)1:ac_incid1 0.336878 0.112156 3.004 0.002667 **
## ns(age2, df = 2)2:ac_incid1 0.046820 0.018778 2.493 0.012654 *
## ns(age2, df = 2)1:offunif21 0.003367 0.124426 0.027 0.978410
## ns(age2, df = 2)2:offunif21 -0.023409 0.020692 -1.131 0.257926
## cs_furtv1:daytime1 -0.032376 0.079574 -0.407 0.684107
## cs_furtv1:inout21 -0.009905 0.099383 -0.100 0.920607
## cs_furtv1:ac_incid1 0.268687 0.080100 3.354 0.000795 ***
## cs_furtv1:offunif21 0.005100 0.087986 0.058 0.953781
## cs_lkout1:daytime1 -0.068767 0.113664 -0.605 0.545174
## cs_lkout1:inout21 0.184822 0.140299 1.317 0.187725
## cs_lkout1:ac_incid1 0.207309 0.121317 1.709 0.087484 .
## cs_lkout1:offunif21 -0.233937 0.119859 -1.952 0.050966 .
## ns(age2, df = 2)1:cs_furtv1:daytime1 -0.036447 0.155870 -0.234 0.815120
## ns(age2, df = 2)2:cs_furtv1:daytime1 -0.013405 0.025871 -0.518 0.604364
## ns(age2, df = 2)1:cs_furtv1:inout21 0.262551 0.195227 1.345 0.178674
## ns(age2, df = 2)2:cs_furtv1:inout21 0.032966 0.032473 1.015 0.310018
## ns(age2, df = 2)1:cs_furtv1:ac_incid1 0.028586 0.156771 0.182 0.855314
## ns(age2, df = 2)2:cs_furtv1:ac_incid1 -0.015991 0.025684 -0.623 0.533561
## ns(age2, df = 2)1:cs_furtv1:offunif21 -0.129535 0.171827 -0.754 0.450929
## ns(age2, df = 2)2:cs_furtv1:offunif21 0.051800 0.027822 1.862 0.062620 .
## ns(age2, df = 2)1:cs_lkout1:daytime1 0.110776 0.222081 0.499 0.617916
## ns(age2, df = 2)2:cs_lkout1:daytime1 0.002142 0.035894 0.060 0.952424
## ns(age2, df = 2)1:cs_lkout1:inout21 0.148753 0.275568 0.540 0.589333
## ns(age2, df = 2)2:cs_lkout1:inout21 -0.011218 0.045427 -0.247 0.804946
## ns(age2, df = 2)1:cs_lkout1:ac_incid1 0.085850 0.237168 0.362 0.717366
## ns(age2, df = 2)2:cs_lkout1:ac_incid1 0.018166 0.038240 0.475 0.634746
## ns(age2, df = 2)1:cs_lkout1:offunif21 0.226683 0.233823 0.969 0.332313
## ns(age2, df = 2)2:cs_lkout1:offunif21 -0.001238 0.037254 -0.033 0.973493
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1795133 on 1637790 degrees of freedom
## Residual deviance: 1761344 on 1637746 degrees of freedom
## AIC: 1761434
##
## Number of Fisher Scoring iterations: 4
predFrame <- expand.grid(cs_furtv = factor(c(0,1)),
ac_incid = factor(c(0,1)),
daytime = factor(c(0,1)),
inout2 = factor(c(0,1)),
offunif2 = factor(c(0,1)),
cs_lkout = factor(c(0,1)),
age2 = 10:21)
predFrame$predicted_prob <- predict(int_model, type = "response", newdata = predFrame, interval = "confidence")
ggplot(predFrame, aes(x = age2, y = predicted_prob)) +
facet_grid(cs_lkout + cs_furtv ~ ac_incid + daytime + inout2 + offunif2, label = label_both) +
#geom_ribbon(aes(ymin = lwr, ymax = upr), fill = gray(0.85)) +
geom_line(color = "cadetblue") + coord_cartesian(ylim = c(0.1,0.4))
#Residualplot of the final model
cv_data <- mice1 %>%
mutate(bin_force = as.numeric(as.character(bin_force)))
final_model <- glm(bin_force ~ ns(age2, df = 2)*(cs_furtv + cs_lkout)*(daytime + inout2 + ac_incid + offunif2), data = cv_data, family = binomial)
simDiag <- fortify(final_model)
ggplot(simDiag, aes(x = .fitted, y = .resid)) +
geom_point() +
geom_smooth()
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
set.seed(2023)
yNew1 <- simulate(final_model)[,1]
yNew2 <- simulate(final_model)[,1]
yNew3 <- simulate(final_model)[,1]
yNew4 <- simulate(final_model)[,1]
simGlmNew1 <- glm(yNew1 ~ ns(age2, df = 2)*(cs_furtv + cs_lkout)*(daytime + inout2 + ac_incid + offunif2), data = cv_data, family = binomial(link = logit), control = glm.control(maxit = 1000))
simGlmNew2 <- glm(yNew2 ~ ns(age2, df = 2)*(cs_furtv + cs_lkout)*(daytime + inout2 + ac_incid + offunif2), data = cv_data, family = binomial(link = logit), control = glm.control(maxit = 1000))
simGlmNew3 <- glm(yNew3 ~ ns(age2, df = 2)*(cs_furtv + cs_lkout)*(daytime + inout2 + ac_incid + offunif2), data = cv_data, family = binomial(link = logit), control = glm.control(maxit = 1000))
simGlmNew4 <- glm(yNew4 ~ ns(age2, df = 2)*(cs_furtv + cs_lkout)*(daytime + inout2 + ac_incid + offunif2), data = cv_data, family = binomial(link = logit), control = glm.control(maxit = 1000))
simDiagNew1 <- fortify(simGlmNew1)
simDiagNew2 <- fortify(simGlmNew2)
simDiagNew3 <- fortify(simGlmNew3)
simDiagNew4 <- fortify(simGlmNew4)
p1 <- qplot(.fitted, .resid, data = simDiagNew1) +
geom_smooth()
p2 <- qplot(.fitted, .resid, data = simDiagNew2) +
geom_smooth()
p3 <- qplot(.fitted, .resid, data = simDiagNew3) +
geom_smooth()
p4 <- qplot(.fitted, .resid, data = simDiagNew4) +
geom_smooth()
grid.arrange(p1,p2,p3,p4, ncol=2)
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'