Load and mutate data

data <- d2 %>%
  select(-c("force2", "pct")) %>%
  mutate_at(vars(all_of(yes_no_columns)), ~ifelse(. == "N", 0, 1))%>%
  mutate(race2 = case_when(
    race2 == "white"    ~ 0,
    race2 == "black"    ~ 1,
    race2 == "hisp"     ~ 2,
    race2 == "asian"    ~ 3,
    race2 == "other"    ~ 4
  )) %>%
  mutate(typeofid2  = case_when(
    typeofid2 == "O"    ~ 0,
    typeofid2 == "P"    ~ 1,
    typeofid2 == "R"    ~ 2,
    typeofid2 == "V"    ~ 3,
  )) %>%
  mutate(across(all_of(factor_columns), as.factor)) %>%
  mutate(year = as.numeric(year))

summary(data)

##      force          race2          gender             age2       daytime       
##  0      :3910239   0   : 492688   0   :4495436   Min.   :10      0   :2472074  
##  1      : 736051   1   :2886843   1   : 348159   1st Qu.:19      1   :1871287  
##  3      : 156732   2   :1215401   NA's: 140796   Median :24      NA's: 641030  
##  2      : 122898   3   : 152053                  Mean   :28                    
##  5      :  31590   4   : 235105                  3rd Qu.:34                    
##  6      :  17769   NA's:   2301                  Max.   :90                    
##  (Other):   9112                                 NA's   :41534                 
##   inout2        ac_incid    ac_time     offunif2       typeofid2     
##  0   :3809789   0:2204888   0:3158281   0   :1396968   0   :  84217  
##  1   :1147150   1:2779503   1:1826110   1   :3586909   1   :2640092  
##  NA's:  27452                           NA's:    514   2   : 107497  
##                                                        3   :2133368  
##                                                        NA's:  19217  
##                                                                      
##                                                                      
##  othpers2       cs_objcs    cs_descr    cs_casng    cs_lkout    cs_cloth   
##  0   :3806808   0:4850690   0:4116587   0:3562617   0:4151151   0:4773826  
##  1   :1163298   1: 133701   1: 867804   1:1421774   1: 833240   1: 210565  
##  NA's:  14285                                                              
##                                                                            
##                                                                            
##                                                                            
##                                                                            
##  cs_drgtr    cs_furtv    cs_vcrim    cs_bulge    cs_other         year     
##  0:4522192   0:2801105   0:4588325   0:4543022   0:3944522   Min.   :2003  
##  1: 462199   1:2183286   1: 396066   1: 441369   1:1039869   1st Qu.:2006  
##                                                              Median :2009  
##                                                              Mean   :2008  
##                                                              3rd Qu.:2011  
##                                                              Max.   :2013  
##

Full data set exploratory analysis

Histograms of all the covariates except age and year which are seen below.

We make a Spearman correlation for all the covariates except for the two we discarded.

cp <- cor(data.matrix(na.omit(data)), method = "spearman")

ord <- rev(hclust(as.dist(1-abs(cp)))$order)

colPal <- colorRampPalette(c("white", "darkseagreen4"), space = "rgb")(100)

spearman <- levelplot(cp[ord, ord], xlab = "", ylab = "",
col.regions = colPal, at = seq(-1, 1, length.out = 100),
colorkey = list(space = "top", labels = list(cex = 0.5)),
scales = list(x = list(rot = 45),
y = list(rot = 45),
cex = 0.5))

spearman

We now define a new data set where force id 0 if there is no force, and 1 if there is any kind of force. This is because we later want to fit a binomial model.

table(data$force, data$year)

##    
##       2003   2004   2005   2006   2007   2008   2009   2010   2011   2012
##   0 119564 246398 312773 406541 365744 414700 439271 463337 540799 443857
##   1  21225  41392  56400  69177  75406  91045 103625  97624 102471  57803
##   2   8134  10412  12542    653  13097  14096  16467  16444  16978  10099
##   3   7938  10637  11812  11366  14079  16265  17690  19265  21076  17697
##   4    912   1010    937    631    612    496    481    498    641    712
##   5   1166   1579   1684  16069   1369   1792   1831   2128   1845   1418
##   6   1787   1954   1808   1868   1592   1747   1609   1807   1730   1215
##   7    125    141    235    184    197    161    194    182    184    110
##    
##       2013
##   0 157255
##   1  19883
##   2   3976
##   3   8907
##   4    435
##   5    709
##   6    652
##   7     34

Not many force 4-observations. maybe it’s not a big difference for the police to go from “4: at least drawing a weapon on a civilian.” to “5: at least pushing a civilian to the ground.” Which gives reason for the choice of joining some “forces” and fitting a binomial model.

Data analysis

Analysis of missing values

How can we replace the missing values? Maybe we should do this only for the columns we are interested in and not all of them.

Diagnostistke NA-plot på hele datasættet??

## 
## Attaching package: 'mice'

## The following object is masked from 'package:stats':
## 
##     filter

## The following objects are masked from 'package:base':
## 
##     cbind, rbind

## Loading required package: colorspace

## The legacy packages maptools, rgdal, and rgeos, underpinning the sp package,
## which was just loaded, were retired in October 2023.
## Please refer to R-spatial evolution reports for details, especially
## https://r-spatial.org/r/2023/05/15/evolution4.html.
## It may be desirable to make the sf package available;
## package maintainers should consider adding sf to Suggests:.

## VIM is ready to use.

## Suggestions and bug-reports can be submitted at: https://github.com/statistikat/VIM/issues

## 
## Attaching package: 'VIM'

## The following object is masked from 'package:datasets':
## 
##     sleep

## 
## Attaching package: 'ggmice'

## The following objects are masked from 'package:mice':
## 
##     bwplot, densityplot, stripplot, xyplot

## The following objects are masked from 'package:lattice':
## 
##     bwplot, densityplot, stripplot, xyplot

NAplot <- md.pattern(modeldata,
           rotate.names = TRUE)

pMiss <- function(x){sum(is.na(x))/length(x)*100}
apply(data,2,pMiss)

##       force       race2      gender        age2     daytime      inout2 
##  0.00000000  0.04616412  2.82473827  0.83328134 12.86074869  0.55075936 
##    ac_incid     ac_time    offunif2   typeofid2    othpers2    cs_objcs 
##  0.00000000  0.00000000  0.01031219  0.38554359  0.28659469  0.00000000 
##    cs_descr    cs_casng    cs_lkout    cs_cloth    cs_drgtr    cs_furtv 
##  0.00000000  0.00000000  0.00000000  0.00000000  0.00000000  0.00000000 
##    cs_vcrim    cs_bulge    cs_other        year 
##  0.00000000  0.00000000  0.00000000  0.00000000

#apply(modeldata,1,pMiss)

aggr_plot <- aggr(data, col=c('cadetblue','darkseagreen2'), 
                  numbers=TRUE, 
                  sortVars=TRUE, 
                  labels=names(modeldata), 
                  cex.axis=.7, gap=3, 
                  ylab=c("Histogram of missing data","Pattern"))

## 
##  Variables sorted by number of missings: 
##  Variable        Count
##  ac_incid 0.1286074869
##   daytime 0.0282473827
##    inout2 0.0083328134
##  offunif2 0.0055075936
##      age2 0.0038554359
##   daytime 0.0028659469
##      age2 0.0004616412
##     force 0.0001031219
##     force 0.0000000000
##  cs_furtv 0.0000000000
##  cs_lkout 0.0000000000
##    inout2 0.0000000000
##  ac_incid 0.0000000000
##  offunif2 0.0000000000
##  cs_furtv 0.0000000000
##  cs_lkout 0.0000000000
##     force 0.0000000000
##      age2 0.0000000000
##   daytime 0.0000000000
##    inout2 0.0000000000
##  ac_incid 0.0000000000
##  offunif2 0.0000000000

We dont see a strong pattern of how the values are missing.

12% of data is missing “daytime”. We investigate how the data is missing:

We suspect some correlation between daytime an ac_time stratified on force.

not a strong correlation, but this may indicate, that the values are missing at random

Cross tabulation:

cross_tab <- table(data$year, data$daytime, useNA = "ifany")
row_sums <- rowSums(cross_tab, na.rm = TRUE)
cross_tab_with_sums <- cbind(cross_tab, row_sums)

print(cross_tab_with_sums)

##           0      1   <NA> row_sums
## 2003 100148  60686     17   160851
## 2004 201078 112376     69   313523
## 2005 250292 147348    551   398191
## 2006 243565 185416  77508   506489
## 2007 232487 162968  76641   472096
## 2008 261735 191513  87054   540302
## 2009 266595 228727  85846   581168
## 2010 278109 225644  97532   601285
## 2011 307707 275828 102189   685724
## 2012 242830 208138  81943   532911
## 2013  87528  72643  31680   191851

From the cross tabulation we see that in year 2003, 2004 and 2005 there are not many missing values ratio between day and night for these years are quite similar to the ratio in the following years when not taking the NA’s into account. This indicates once more, that the values are missing at random.

Imputed data model fitting

modeldata_imp <- mice(modeldata,m=5,maxit=50,meth='pmm',seed=500)
summary(modeldata_imp)

#watch imputed data
modeldata_imp$imp$daytime

#puts imputed data from 1st bootstrap back into dataset
modeldata_mice1 <- complete(modeldata_imp, 1)
modeldata_mice2 <- complete(modeldata_imp, 2)
modeldata_mice3 <- complete(modeldata_imp, 3)
modeldata_mice4 <- complete(modeldata_imp, 4)
modeldata_mice5 <- complete(modeldata_imp, 5)

Histogram of imputed data

df_orig <- data.frame(mice1$daytime, mice2$daytime, mice3$daytime, mice4$daytime, mice5$daytime, modeldata_orig$daytime)
df_orig_long <- df_orig %>%
  pivot_longer(everything(), names_to = "Dataset", values_to = "BinaryValue")

# Create a histogram with 
p_orig <- ggplot(df_orig_long, aes(x = BinaryValue, fill = Dataset)) +
  geom_bar(position = "dodge") +
  scale_fill_manual(values = c("mice1.daytime" = "darkseagreen", "mice2.daytime" = "darkseagreen1", "mice3.daytime" = "darkseagreen4", "mice4.daytime" = "darkseagreen3", "mice5.daytime" = "darkseagreen2", "modeldata.daytime" = "cadetblue3")) +
  labs(fill = "Dataset") +
  theme(legend.position = "top", text = element_text(size =7))



df_mice <- data.frame(mice1$daytime, mice2$daytime, mice3$daytime, mice4$daytime, mice5$daytime)
df_mice_long <- df_mice %>%
  pivot_longer(everything(), names_to = "Dataset", values_to = "BinaryValue")

# Create a histogram without original in comparison
p_mice <- ggplot(df_mice_long, aes(x = BinaryValue, fill = Dataset)) +
  geom_bar(position = "dodge") +
  scale_fill_manual(values = c("mice1.daytime" = "darkseagreen", "mice2.daytime" = "darkseagreen1", "mice3.daytime" = "darkseagreen4", "mice4.daytime" = "darkseagreen3", "mice5.daytime" = "darkseagreen2")) +
  labs(fill = "Dataset") +
  theme(legend.position = "top", text = element_text(size =7))

grid.arrange(p_orig, p_mice, ncol = 2)

## Warning: Removed 641030 rows containing non-finite values (`stat_count()`).

Remember to import data!

Estimate reconcilation

mice1$dataset <- "Dataset 1"
mice2$dataset <- "Dataset 2"
mice3$dataset <- "Dataset 3"
mice4$dataset <- "Dataset 4"
mice5$dataset <- "Dataset 5"

combined_data <- rbind(mice1, mice2, mice3, mice4, mice5)

models <- split(combined_data, combined_data$dataset) %>% 
  map(~ glm(bin_force ~ age2 + daytime + inout2 + ac_incid + offunif2 + cs_furtv + cs_lkout + year, data = ., family = binomial(link=logit)))

plot_data <- map_dfr(models, broom::tidy, .id = "dataset")

ggplot(plot_data, aes(x = term, y = estimate, color = dataset)) +
  geom_point() +
  geom_errorbar(aes(ymin = estimate - std.error, ymax = estimate + std.error), width = 0.2) +
  theme_minimal() + theme(axis.text.x = element_text(angle = 60, vjust = 1, hjust=1))

Pooling the imputed data sets

Rubin’s Rules consist of three steps: a. Combine the Estimates: - Calculate the mean of the estimates from each dataset to obtain a pooled estimate. b. Within-Imputation Variance: - Calculate the average of the variances of the estimates from each dataset. c. Between-Imputation Variance: - Calculate the variance of the estimates across the datasets.

The total variance of the pooled estimate is the sum of the within-imputation variance and the between-imputation variance. This total variance is used to calculate confidence intervals and perform hypothesis tests.

fits <- lapply(list(mice1, mice2, mice3, mice4, mice5), function(data) {
  glm(bin_force ~ age2 + daytime + inout2 + ac_incid + offunif2 + cs_furtv + cs_lkout + year, 
      family = binomial(link = logit), data = data) })

pooled_results <- pool(fits)
summary(pooled_results)

##          term     estimate    std.error  statistic           df       p.value
## 1 (Intercept) 76.505640557 1.4719473039  51.975801    435.62782 7.205414e-189
## 2        age2 -0.002136738 0.0010921342  -1.956479    199.51943  5.180425e-02
## 3    daytime1 -0.105629747 0.0047350305 -22.308145     32.70059  2.345945e-21
## 4     inout21 -0.305131859 0.0049463252 -61.688596   1097.65590  0.000000e+00
## 5   ac_incid1 -0.172769692 0.0037836336 -45.662374 337896.36350  0.000000e+00
## 6   offunif21 -0.327961710 0.0040851215 -80.281998   2578.72688  0.000000e+00
## 7   cs_furtv1  0.424573651 0.0039458263 107.600696    975.63208  0.000000e+00
## 8   cs_lkout1 -0.242327615 0.0051538063 -47.019155 183889.57772  0.000000e+00
## 9        year -0.038526211 0.0007332462 -52.541982    436.69020 6.737645e-191

Analysis on imputed datasets

fits <- lapply(list(mice1, mice2, mice3, mice4, mice5), function(data) {
  glm(bin_force ~ age2 + daytime + inout2 + ac_incid + offunif2 + cs_furtv + cs_lkout, 
      family = binomial(link = logit), data = data) })

pooled_results <- pool(fits)
summary(pooled_results)

##          term    estimate   std.error  statistic           df      p.value
## 1 (Intercept) -0.82443676 0.020609885 -40.002006    123.84117 1.131550e-72
## 2        age2 -0.00207628 0.001087369  -1.909452    216.44996 5.752608e-02
## 3    daytime1 -0.12324138 0.004574285 -26.942219     41.82147 4.713664e-28
## 4     inout21 -0.30387964 0.004937604 -61.543941   1162.36248 0.000000e+00
## 5   ac_incid1 -0.18612627 0.003773149 -49.329160 266548.88025 0.000000e+00
## 6   offunif21 -0.34171987 0.004060010 -84.167249   3667.77577 0.000000e+00
## 7   cs_furtv1  0.38695123 0.003896823  99.299152    714.54784 0.000000e+00
## 8   cs_lkout1 -0.25230130 0.005145107 -49.037132 191471.46264 0.000000e+00

est1 <- summary(fits[[1]])$coefficients[,1]
est2 <- summary(fits[[2]])$coefficients[,1]
est3 <- summary(fits[[3]])$coefficients[,1]
est4 <- summary(fits[[4]])$coefficients[,1]
est5 <- summary(fits[[5]])$coefficients[,1]
pool <- summary(pooled_results)$estimate

cbind(est1,est2,est3,est4,est5,pool)

##                     est1         est2         est3         est4         est5
## (Intercept) -0.818488383 -0.835824062 -0.824162938 -0.815690404 -0.828018006
## age2        -0.002420985 -0.001548769 -0.002115183 -0.002400004 -0.001896457
## daytime1    -0.125052486 -0.120756028 -0.123369800 -0.125958404 -0.121070188
## inout21     -0.303442907 -0.303096167 -0.303209032 -0.305753382 -0.303896698
## ac_incid1   -0.186238958 -0.185765052 -0.186155166 -0.186232445 -0.186239707
## offunif21   -0.342267140 -0.341523545 -0.340664646 -0.342309572 -0.341834428
## cs_furtv1    0.388029540  0.387548085  0.387058893  0.385482562  0.386637051
## cs_lkout1   -0.252237630 -0.252189893 -0.252105262 -0.252129678 -0.252844024
##                    pool
## (Intercept) -0.82443676
## age2        -0.00207628
## daytime1    -0.12324138
## inout21     -0.30387964
## ac_incid1   -0.18612627
## offunif21   -0.34171987
## cs_furtv1    0.38695123
## cs_lkout1   -0.25230130

coefficients_matrix <- matrix(c(est1, est2, est3, est4, est5), 8, 5)

median_coefficients <- apply(coefficients_matrix, 1, median)

median_index <- which.min(median_coefficients)

median_data_set <- paste0("est", median_index)

cat("The data set with median coefficients is:", median_data_set, "\n")

## The data set with median coefficients is: est1

Drop1 for additive model

add_model <- glm(bin_force ~ age2 + daytime + inout2 + ac_incid + offunif2 + cs_furtv + cs_lkout, 
      family = binomial(link = logit), data = mice1)

drop1(add_model, test = "Chisq")

## Single term deletions
## 
## Model:
## bin_force ~ age2 + daytime + inout2 + ac_incid + offunif2 + cs_furtv + 
##     cs_lkout
##          Df Deviance     AIC     LRT Pr(>Chi)    
## <none>       1764236 1764252                     
## age2      1  1764242 1764256     5.7  0.01661 *  
## daytime   1  1765323 1765337  1086.8  < 2e-16 ***
## inout2    1  1768390 1768404  4153.6  < 2e-16 ***
## ac_incid  1  1766678 1766692  2441.4  < 2e-16 ***
## offunif2  1  1771465 1771479  7228.5  < 2e-16 ***
## cs_furtv  1  1774991 1775005 10755.0  < 2e-16 ***
## cs_lkout  1  1766726 1766740  2489.9  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Training error and cross validation on different forms

trainErr <- function(mod) {
  mean(residuals(mod, type = "deviance") ^ 2)
}

form0 <- bin_force ~ age2 + daytime + inout2 + ac_incid + offunif2 + cs_lkout + cs_furtv


form1 <- bin_force ~ age2*daytime + inout2 + ac_incid + offunif2 + cs_lkout + cs_furtv

form2 <- bin_force ~ age2 + daytime*cs_lkout + inout2 + ac_incid + offunif2 + cs_furtv

form3 <- bin_force ~ age2 + daytime*cs_furtv + inout2 + ac_incid + offunif2 + cs_lkout


form4 <- bin_force ~ inout2*age2 + daytime + ac_incid + offunif2 + cs_lkout + cs_furtv

form5 <- bin_force ~ age2 + daytime + ac_incid + offunif2 + inout2*cs_lkout + cs_furtv

form6 <- bin_force ~ age2 + daytime + ac_incid + offunif2 + cs_lkout + inout2*cs_furtv


form7 <- bin_force ~ age2*ac_incid + daytime + inout2 + offunif2 + cs_lkout + cs_furtv

form8 <- bin_force ~ age2 + daytime + inout2 + offunif2 + cs_lkout*ac_incid + cs_furtv

form9 <- bin_force ~ age2 + daytime + inout2 + offunif2 + cs_lkout + cs_furtv*ac_incid


form10 <- bin_force ~ age2*offunif2 + daytime + inout2 + ac_incid + cs_lkout + cs_furtv

form11 <- bin_force ~ age2 + daytime + inout2 + ac_incid + cs_lkout*offunif2 + cs_furtv

form12 <- bin_force ~ age2 + daytime + inout2 + ac_incid + cs_lkout + cs_furtv*offunif2 


form13 <- bin_force ~ age2 + daytime + inout2 + ac_incid + cs_lkout*cs_furtv*offunif2 

form14 <- bin_force ~ age2 + daytime + inout2 + cs_furtv*offunif2*ac_incid + cs_lkout 

form15 <- bin_force ~ age2 + daytime + cs_furtv*offunif2*inout2 + ac_incid + cs_lkout 

form16 <- bin_force ~ age2 + cs_furtv*offunif2*daytime + inout2 + ac_incid + cs_lkout 

form17 <- bin_force ~ cs_furtv*offunif2*age2 + daytime + inout2 + ac_incid + cs_lkout 

form18 <- bin_force ~ age2*(cs_furtv + cs_lkout)*(daytime + inout2 + ac_incid + offunif2)

form19 <- bin_force ~ ns(age2, df = 2)*(cs_furtv + cs_lkout)*(daytime + inout2 + ac_incid + offunif2)


sapply(list(form0, form1, form2, form3, form4, form5, form6, form7, form8, form9, form10, form11, form12, form13, form14, form15, form16, form17, form18, form19), function(form){trainErr(glm(form, data = mice1, family = binomial(link = "logit")))})

##  [1] 1.077205 1.077204 1.077203 1.077165 1.077194 1.077006 1.077101 1.077193
##  [9] 1.076873 1.076300 1.077205 1.077127 1.077171 1.076194 1.076083 1.076182
## [17] 1.077116 1.077131 1.075485 1.075439

cv_data <- mice1 %>%
  mutate(bin_force = as.numeric(as.character(bin_force)))


testCV <- function(form, data, B = 1, k = 5) {
  n <- nrow(data)
  y <- data$bin_force
  PEcv <- vector("list", B)
  tmp <- numeric(n)
  for(b in 1:B) {
    
    group <- sample(rep(1:k, length.out = n))
    for(i in 1:k) {
      modelcv <- glm(form, family=binomial("logit"), data = data[group != i, ]) # should be changed
      
      muhat <- predict(modelcv, newdata = data[group == i, ], type = "response") # "reponse" takes the inverse of the link
       y_cv <- y[group == i]
       
       tmp[group == i] <- -(2*(y_cv*log(muhat)+(1-y_cv)*log(1-muhat)))
  
    }
    PEcv[[b]] <- tmp
  }
  mean(unlist(PEcv))
}



sapply(list(form0,form1, form2, form3, form4, form5, form6, form7, form8, form9, form10, form11, form12, form13, form14, form15, form16, form17, form18, form19), function(form){testCV(form, cv_data)})

##  [1] 1.077219 1.077216 1.077214 1.077183 1.077204 1.077013 1.077119 1.077210
##  [9] 1.076889 1.076311 1.077213 1.077143 1.077187 1.076211 1.076100 1.076198
## [17] 1.077128 1.077149 1.075538 1.075492

Interaction model

int_model <- glm(bin_force ~ ns(age2, df = 2)*(cs_furtv + cs_lkout)*(daytime + inout2 + ac_incid + offunif2), 
      family = binomial(link = logit), data = mice1)

int_model_no_spl <- glm(bin_force ~ age2*(cs_furtv + cs_lkout)*(daytime + inout2 + ac_incid + offunif2), 
      family = binomial(link = logit), data = mice1)

anova(int_model_no_spl, int_model, test = "Chisq")

## Analysis of Deviance Table
## 
## Model 1: bin_force ~ age2 * (cs_furtv + cs_lkout) * (daytime + inout2 + 
##     ac_incid + offunif2)
## Model 2: bin_force ~ ns(age2, df = 2) * (cs_furtv + cs_lkout) * (daytime + 
##     inout2 + ac_incid + offunif2)
##   Resid. Df Resid. Dev Df Deviance  Pr(>Chi)    
## 1   1637761    1761420                          
## 2   1637746    1761344 15    76.08 3.611e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

int_model <- glm(bin_force ~ ns(age2, df = 2)*(cs_furtv + cs_lkout)*(daytime + inout2 + ac_incid + offunif2), 
      family = binomial(link = logit), data = mice1)

summary(int_model)

## 
## Call:
## glm(formula = bin_force ~ ns(age2, df = 2) * (cs_furtv + cs_lkout) * 
##     (daytime + inout2 + ac_incid + offunif2), family = binomial(link = logit), 
##     data = mice1)
## 
## Coefficients:
##                                        Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                           -0.787338   0.068099 -11.562  < 2e-16 ***
## ns(age2, df = 2)1                     -0.023967   0.133242  -0.180 0.857249    
## ns(age2, df = 2)2                     -0.038529   0.022103  -1.743 0.081307 .  
## cs_furtv1                              0.189492   0.096130   1.971 0.048702 *  
## cs_lkout1                             -0.250573   0.144289  -1.737 0.082457 .  
## daytime1                              -0.126301   0.055835  -2.262 0.023696 *  
## inout21                               -0.370558   0.066126  -5.604 2.10e-08 ***
## ac_incid1                             -0.535294   0.057190  -9.360  < 2e-16 ***
## offunif21                             -0.291377   0.063500  -4.589 4.46e-06 ***
## ns(age2, df = 2)1:cs_furtv1            0.154328   0.187419   0.823 0.410257    
## ns(age2, df = 2)2:cs_furtv1            0.050610   0.030288   1.671 0.094729 .  
## ns(age2, df = 2)1:cs_lkout1           -0.266411   0.281059  -0.948 0.343190    
## ns(age2, df = 2)2:cs_lkout1            0.001139   0.044856   0.025 0.979746    
## ns(age2, df = 2)1:daytime1             0.058915   0.109676   0.537 0.591150    
## ns(age2, df = 2)2:daytime1            -0.006084   0.018817  -0.323 0.746439    
## ns(age2, df = 2)1:inout21             -0.030500   0.130124  -0.234 0.814680    
## ns(age2, df = 2)2:inout21             -0.070912   0.022170  -3.199 0.001381 ** 
## ns(age2, df = 2)1:ac_incid1            0.336878   0.112156   3.004 0.002667 ** 
## ns(age2, df = 2)2:ac_incid1            0.046820   0.018778   2.493 0.012654 *  
## ns(age2, df = 2)1:offunif21            0.003367   0.124426   0.027 0.978410    
## ns(age2, df = 2)2:offunif21           -0.023409   0.020692  -1.131 0.257926    
## cs_furtv1:daytime1                    -0.032376   0.079574  -0.407 0.684107    
## cs_furtv1:inout21                     -0.009905   0.099383  -0.100 0.920607    
## cs_furtv1:ac_incid1                    0.268687   0.080100   3.354 0.000795 ***
## cs_furtv1:offunif21                    0.005100   0.087986   0.058 0.953781    
## cs_lkout1:daytime1                    -0.068767   0.113664  -0.605 0.545174    
## cs_lkout1:inout21                      0.184822   0.140299   1.317 0.187725    
## cs_lkout1:ac_incid1                    0.207309   0.121317   1.709 0.087484 .  
## cs_lkout1:offunif21                   -0.233937   0.119859  -1.952 0.050966 .  
## ns(age2, df = 2)1:cs_furtv1:daytime1  -0.036447   0.155870  -0.234 0.815120    
## ns(age2, df = 2)2:cs_furtv1:daytime1  -0.013405   0.025871  -0.518 0.604364    
## ns(age2, df = 2)1:cs_furtv1:inout21    0.262551   0.195227   1.345 0.178674    
## ns(age2, df = 2)2:cs_furtv1:inout21    0.032966   0.032473   1.015 0.310018    
## ns(age2, df = 2)1:cs_furtv1:ac_incid1  0.028586   0.156771   0.182 0.855314    
## ns(age2, df = 2)2:cs_furtv1:ac_incid1 -0.015991   0.025684  -0.623 0.533561    
## ns(age2, df = 2)1:cs_furtv1:offunif21 -0.129535   0.171827  -0.754 0.450929    
## ns(age2, df = 2)2:cs_furtv1:offunif21  0.051800   0.027822   1.862 0.062620 .  
## ns(age2, df = 2)1:cs_lkout1:daytime1   0.110776   0.222081   0.499 0.617916    
## ns(age2, df = 2)2:cs_lkout1:daytime1   0.002142   0.035894   0.060 0.952424    
## ns(age2, df = 2)1:cs_lkout1:inout21    0.148753   0.275568   0.540 0.589333    
## ns(age2, df = 2)2:cs_lkout1:inout21   -0.011218   0.045427  -0.247 0.804946    
## ns(age2, df = 2)1:cs_lkout1:ac_incid1  0.085850   0.237168   0.362 0.717366    
## ns(age2, df = 2)2:cs_lkout1:ac_incid1  0.018166   0.038240   0.475 0.634746    
## ns(age2, df = 2)1:cs_lkout1:offunif21  0.226683   0.233823   0.969 0.332313    
## ns(age2, df = 2)2:cs_lkout1:offunif21 -0.001238   0.037254  -0.033 0.973493    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1795133  on 1637790  degrees of freedom
## Residual deviance: 1761344  on 1637746  degrees of freedom
## AIC: 1761434
## 
## Number of Fisher Scoring iterations: 4

predFrame <- expand.grid(cs_furtv = factor(c(0,1)),
                         ac_incid = factor(c(0,1)),
                         daytime = factor(c(0,1)),
                         inout2 = factor(c(0,1)),
                         offunif2 = factor(c(0,1)),
                         cs_lkout = factor(c(0,1)),
                         age2 = 10:21)

predFrame$predicted_prob <- predict(int_model, type = "response", newdata = predFrame, interval = "confidence")

ggplot(predFrame, aes(x = age2, y = predicted_prob)) +
  facet_grid(cs_lkout + cs_furtv ~ ac_incid + daytime + inout2 + offunif2, label = label_both) +
  #geom_ribbon(aes(ymin = lwr, ymax = upr), fill = gray(0.85)) +
  geom_line(color = "cadetblue") + coord_cartesian(ylim = c(0.1,0.4))

#Residualplot of the final model

cv_data <- mice1 %>%
  mutate(bin_force = as.numeric(as.character(bin_force)))

final_model <- glm(bin_force ~ ns(age2, df = 2)*(cs_furtv + cs_lkout)*(daytime + inout2 + ac_incid + offunif2), data = cv_data, family = binomial)

simDiag <- fortify(final_model)

ggplot(simDiag, aes(x = .fitted, y = .resid)) +
  geom_point() +
  geom_smooth()

## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

set.seed(2023)

yNew1 <- simulate(final_model)[,1]
yNew2 <- simulate(final_model)[,1]
yNew3 <- simulate(final_model)[,1]
yNew4 <- simulate(final_model)[,1]


simGlmNew1 <- glm(yNew1 ~ ns(age2, df = 2)*(cs_furtv + cs_lkout)*(daytime + inout2 + ac_incid + offunif2), data = cv_data, family = binomial(link = logit), control = glm.control(maxit = 1000))
simGlmNew2 <- glm(yNew2 ~ ns(age2, df = 2)*(cs_furtv + cs_lkout)*(daytime + inout2 + ac_incid + offunif2), data = cv_data, family = binomial(link = logit), control = glm.control(maxit = 1000))
simGlmNew3 <- glm(yNew3 ~ ns(age2, df = 2)*(cs_furtv + cs_lkout)*(daytime + inout2 + ac_incid + offunif2), data = cv_data, family = binomial(link = logit), control = glm.control(maxit = 1000))
simGlmNew4 <- glm(yNew4 ~ ns(age2, df = 2)*(cs_furtv + cs_lkout)*(daytime + inout2 + ac_incid + offunif2), data = cv_data, family = binomial(link = logit), control = glm.control(maxit = 1000))

simDiagNew1 <- fortify(simGlmNew1)
simDiagNew2 <- fortify(simGlmNew2)
simDiagNew3 <- fortify(simGlmNew3)
simDiagNew4 <- fortify(simGlmNew4)


p1 <- qplot(.fitted, .resid, data = simDiagNew1) +
  geom_smooth()
p2 <- qplot(.fitted, .resid, data = simDiagNew2) +
geom_smooth()
p3 <- qplot(.fitted, .resid, data = simDiagNew3) +
geom_smooth()
p4 <- qplot(.fitted, .resid, data = simDiagNew4) +
geom_smooth()


grid.arrange(p1,p2,p3,p4, ncol=2)

## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

Assignment Regression