#Libraries

library(interactions)
library(car)
library(jtools)
library(arm)
library(effects)
library(faraway)
library(mgcv)
library(sjPlot) 
library(sjmisc) 
library(lme4) 
library(interactions)
library(ggplot2)
library(ggeffects)
library(lme4)
library(sjPlot)
library(dplyr)
library(sjPlot)
library(sjmisc)
library(sjlabelled)
library(lme4)
library(ggpubr)
library(car)
library(effects)
library(gplots)
library(plotrix)
library(lmerTest)
library(lattice)
library(kableExtra)
library(nlme)
library(grid)
library(lmerTest)
library(multcomp)
library(readxl)
library(dplyr)
library(car)
library(gridExtra)
library(phyloseq)
library(ggfortify)
library(cowplot)
library(factoextra)
library(FactoMineR)

#Poisson

df1$smoking.habit <- factor(df1$smoking.habit)
glm.poisson1 <- glm(deaths.from.lung.cancer ~ age + smoking.habit, offset = log(population), family=poisson(link=log), data = df1)
glm.poisson2 <- glm(deaths.from.lung.cancer ~ age + smoking.habit, offset = log(population), family=quasipoisson(link=log), data = df1)
glm.poisson3 <- glm(deaths.from.lung.cancer ~ age + smoking.habit + age:smoking.habit, offset = log(population), family=poisson(link=log), data = df1)

resid.ssq <- sum(residuals(glm.poisson1,type="pearson")^2)  ## sum of squares of Pearson resids
resid.df <- nrow(df1)-length(coef(glm.poisson1))        ## estimated resid df (N-p)
resid.ssq/resid.df  
## [1] 2.321771
#compare models
anova(glm.poisson2,glm.poisson3, test = "Chisq")
## Analysis of Deviance Table
## 
## Model 1: deaths.from.lung.cancer ~ age + smoking.habit
## Model 2: deaths.from.lung.cancer ~ age + smoking.habit + age:smoking.habit
##   Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1        31     75.734                     
## 2        28     75.109  3  0.62593   0.8905
summary(glm.poisson1)
## 
## Call:
## glm(formula = deaths.from.lung.cancer ~ age + smoking.habit, 
##     family = poisson(link = log), data = df1, offset = log(population))
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -4.0918  -1.1673  -0.2755   0.7803   2.6364  
## 
## Coefficients:
##                              Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                 -6.036992   0.085126 -70.918  < 2e-16 ***
## age                          0.066601   0.001118  59.559  < 2e-16 ***
## smoking.habitcigarretteOnly  0.405019   0.037463  10.811  < 2e-16 ***
## smoking.habitcigarrettePlus  0.203426   0.035996   5.651 1.59e-08 ***
## smoking.habitno             -0.032927   0.046894  -0.702    0.483    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 4055.984  on 35  degrees of freedom
## Residual deviance:   75.734  on 31  degrees of freedom
## AIC: 325.76
## 
## Number of Fisher Scoring iterations: 4
summary(glm.poisson2)
## 
## Call:
## glm(formula = deaths.from.lung.cancer ~ age + smoking.habit, 
##     family = quasipoisson(link = log), data = df1, offset = log(population))
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -4.0918  -1.1673  -0.2755   0.7803   2.6364  
## 
## Coefficients:
##                              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                 -6.036992   0.129710 -46.542  < 2e-16 ***
## age                          0.066601   0.001704  39.087  < 2e-16 ***
## smoking.habitcigarretteOnly  0.405019   0.057084   7.095 5.69e-08 ***
## smoking.habitcigarrettePlus  0.203426   0.054849   3.709 0.000815 ***
## smoking.habitno             -0.032927   0.071454  -0.461 0.648151    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for quasipoisson family taken to be 2.321771)
## 
##     Null deviance: 4055.984  on 35  degrees of freedom
## Residual deviance:   75.734  on 31  degrees of freedom
## AIC: NA
## 
## Number of Fisher Scoring iterations: 4
summary(glm.poisson3) 
## 
## Call:
## glm(formula = deaths.from.lung.cancer ~ age + smoking.habit + 
##     age:smoking.habit, family = poisson(link = log), data = df1, 
##     offset = log(population))
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -3.7944  -1.2472  -0.2761   0.7303   2.7841  
## 
## Coefficients:
##                                   Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                     -6.0626209  0.2807291 -21.596   <2e-16 ***
## age                              0.0669626  0.0039341  17.021   <2e-16 ***
## smoking.habitcigarretteOnly      0.3614224  0.3040650   1.189    0.235    
## smoking.habitcigarrettePlus      0.2744951  0.2987242   0.919    0.358    
## smoking.habitno                  0.0495613  0.3730296   0.133    0.894    
## age:smoking.habitcigarretteOnly  0.0007739  0.0043653   0.177    0.859    
## age:smoking.habitcigarrettePlus -0.0010914  0.0042554  -0.256    0.798    
## age:smoking.habitno             -0.0012104  0.0053548  -0.226    0.821    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 4055.984  on 35  degrees of freedom
## Residual deviance:   75.109  on 28  degrees of freedom
## AIC: 331.13
## 
## Number of Fisher Scoring iterations: 4
tab_model(glm.poisson1,glm.poisson2,glm.poisson3,
          dv.labels = c("Poisson", "Quasi-Poisson", "Quasi-Poisson interaction","Quasi-Poisson interaction reduced"),
          pred.labels = c("Intercept", "Age", "Smoking", "Age:Smoking"
                          ),
          show.se = TRUE, show.icc = FALSE, 
          show.ci = FALSE, show.df = TRUE, 
          show.obs = FALSE,
          string.est = "Estimate",
          string.p = "p-value",
          string.resp = "Response",
          string.pred = "Coefficients",
          string.df = "df")
## Length of `pred.labels` does not equal number of predictors, no labelling applied.
  Poisson Quasi-Poisson Quasi-Poisson interaction Quasi-Poisson interaction reduced
Coefficients Estimate std. Error p-value df Estimate std. Error p-value df Estimate std. Error p-value df
(Intercept) 0.00 0.09 <0.001 31.00 0.00 0.13 <0.001 31.00 0.00 0.28 <0.001 28.00
age 1.07 0.00 <0.001 31.00 1.07 0.00 <0.001 31.00 1.07 0.00 <0.001 28.00
smoking.habitcigarretteOnly 1.50 0.04 <0.001 31.00 1.50 0.06 <0.001 31.00 1.44 0.30 0.235 28.00
smoking.habitcigarrettePlus 1.23 0.04 <0.001 31.00 1.23 0.05 0.001 31.00 1.32 0.30 0.358 28.00
smoking.habitno 0.97 0.05 0.483 31.00 0.97 0.07 0.648 31.00 1.05 0.37 0.894 28.00
age:smoking.habitcigarretteOnly 1.00 0.00 0.859 28.00
age:smoking.habitcigarrettePlus 1.00 0.00 0.798 28.00
age:smoking.habitno 1.00 0.01 0.821 28.00
R2 Nagelkerke 1.000 1.000 1.000
export_summs(glm.poisson1,glm.poisson2,glm.poisson3, scale = TRUE)
## Note: Pseudo-R2 for quasibinomial/quasipoisson families is calculated by
## refitting the fitted and null models as binomial/poisson.
## Registered S3 methods overwritten by 'broom.mixed':
##   method         from 
##   augment.lme    broom
##   augment.merMod broom
##   glance.lme     broom
##   glance.merMod  broom
##   glance.stanreg broom
##   tidy.brmsfit   broom
##   tidy.gamlss    broom
##   tidy.lme       broom
##   tidy.merMod    broom
##   tidy.rjags     broom
##   tidy.stanfit   broom
##   tidy.stanreg   broom
## Note: Pseudo-R2 for quasibinomial/quasipoisson families is calculated by
## refitting the fitted and null models as binomial/poisson.
Model 1 Model 2 Model 3
(Intercept) -2.04 *** -2.04 *** -2.04 ***
(0.03)    (0.05)    (0.05)   
age 0.87 *** 0.87 *** 0.88 ***
(0.01)    (0.02)    (0.05)   
smoking.habitcigarretteOnly 0.41 *** 0.41 *** 0.41 ***
(0.04)    (0.06)    (0.06)   
smoking.habitcigarrettePlus 0.20 *** 0.20 *** 0.21 ***
(0.04)    (0.05)    (0.06)   
smoking.habitno -0.03     -0.03     -0.02    
(0.05)    (0.07)    (0.07)   
age:smoking.habitcigarretteOnly                 0.01    
                (0.06)   
age:smoking.habitcigarrettePlus                 -0.01    
                (0.06)   
age:smoking.habitno                 -0.02    
                (0.07)   
N 36        36        36       
AIC 325.76             331.13    
BIC 333.67             343.80    
Pseudo R2 1.00     1.00     1.00    
All continuous predictors are mean-centered and scaled by 1 standard deviation. *** p < 0.001; ** p < 0.01; * p < 0.05.
#Diagnostic plots
par(mfrow=c(2,2))
plot(glm.poisson2)

# extract coefficients from first model using 'coef()'
coef1 = coef(glm.poisson1)
coef2 = coef(glm.poisson2)
coef3 = coef(glm.poisson3)
# extract standard errors from first model using 'se.coef()'
se.coef1 = se.coef(glm.poisson1)
se.coef2 = se.coef(glm.poisson2)
se.coef3 = se.coef(glm.poisson3)
# use 'cbind()' to combine values into one dataframe
models.both <- cbind(coef1, se.coef1, coef2, se.coef2,coef3, se.coef3, exponent = exp(coef1))
## Warning in cbind(coef1, se.coef1, coef2, se.coef2, coef3, se.coef3, exponent =
## exp(coef1)): number of rows of result is not a multiple of vector length (arg 1)
# show dataframe
models.both
##                                       coef1    se.coef1       coef2    se.coef2
## (Intercept)                     -6.03699249 0.085126022 -6.03699249 0.129709585
## age                              0.06660122 0.001118244  0.06660122 0.001703908
## smoking.habitcigarretteOnly      0.40501915 0.037462865  0.40501915 0.057083517
## smoking.habitcigarrettePlus      0.20342645 0.035996098  0.20342645 0.054848552
## smoking.habitno                 -0.03292676 0.046893872 -0.03292676 0.071453882
## age:smoking.habitcigarretteOnly -6.03699249 0.085126022 -6.03699249 0.129709585
## age:smoking.habitcigarrettePlus  0.06660122 0.001118244  0.06660122 0.001703908
## age:smoking.habitno              0.40501915 0.037462865  0.40501915 0.057083517
##                                         coef3    se.coef3    exponent
## (Intercept)                     -6.0626208546 0.280729078 0.002388732
## age                              0.0669626334 0.003934071 1.068869147
## smoking.habitcigarretteOnly      0.3614224031 0.304064955 1.499331216
## smoking.habitcigarrettePlus      0.2744950902 0.298724172 1.225595009
## smoking.habitno                  0.0495612783 0.373029559 0.967609427
## age:smoking.habitcigarretteOnly  0.0007739244 0.004365325 0.002388732
## age:smoking.habitcigarrettePlus -0.0010913929 0.004255373 1.068869147
## age:smoking.habitno             -0.0012104216 0.005354802 1.499331216
# plot regression coefficients for poisson.model2 and poisson.model
plot_summs(glm.poisson1,glm.poisson2,glm.poisson3, scale = TRUE, inner_ci_level = .9)
## Note: Pseudo-R2 for quasibinomial/quasipoisson families is calculated by
## refitting the fitted and null models as binomial/poisson.
## Note: Pseudo-R2 for quasibinomial/quasipoisson families is calculated by
## refitting the fitted and null models as binomial/poisson.

#plot effects
effect_plot(glm.poisson2, pred = age, interval = TRUE, plot.points = TRUE, 
            main = ('Age effects on lung cancer'), ylab = ('Number of deaths'), xlab = ('Age'))
## Using data df1 from global environment. This could cause incorrect results
## if df1 has been altered since the model was fit. You can manually provide
## the data to the "data =" argument.
## Outcome is based on a total of 1 exposures

effect_plot(glm.poisson2, pred = smoking.habit, interval = TRUE, plot.points = TRUE, 
            main = ('Smoking effects on lung cancer'))
## Using data df1 from global environment. This could cause incorrect results
## if df1 has been altered since the model was fit. You can manually provide
## the data to the "data =" argument.
## Outcome is based on a total of 1 exposures

plot(allEffects(glm.poisson2),rescale.axis=F, ylab = ('Number of deaths'))

#GAM
gam.lung <- 
  gam(deaths.from.lung.cancer ~ s(age, k=9) + 
        s(smoking.habit, k=4), Family = poisson(link=log), data = df2)
summary(gam.lung)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## deaths.from.lung.cancer ~ s(age, k = 9) + s(smoking.habit, k = 4)
## 
## Parametric coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   253.61      26.24   9.667 1.69e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                    edf Ref.df      F  p-value    
## s(age)           3.612  4.459  6.952 0.000278 ***
## s(smoking.habit) 2.880  2.989 11.113 3.15e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.641   Deviance explained = 70.7%
## GCV =  31290  Scale est. = 24778     n = 36
#Interactions between age and smoking on death lung cancer plots

interact_plot(gam.lung, pred=smoking.habit, modx=age, 
              main = ('Smoking Habit ~ Age interaction plot'), 
              ylab = ('Deaths from lung cancer'), xlab = ('Age'))
## Warning: smoking.habit and age are not included in an interaction with one another
## in the model.

interact_plot(gam.lung, pred=age, modx=smoking.habit, 
              main = ('Age ~ Smoking Habit interaction plot'), 
              ylab = ('Deaths from lung cancer'), xlab = ('Smoking Habit'))
## Warning: age and smoking.habit are not included in an interaction with one another
## in the model.

#Binomial

df3$noOther <- df3$wantsMore == "no"
df3$highEd <- df3$education == "high"

glm.bin <- glm( cbind(using, notUsing) ~
                 age + highEd + noOther , family = binomial, data =df3)
resid.ssqb <- sum(residuals(glm.bin,type="pearson")^2)  ## sum of squares of Pearson resids
resid.dfb <- nrow(df3)-length(coef(glm.bin))        ## estimated resid df (N-p)
resid.ssqb/resid.dfb 
## [1] 2.828834
glm.qbin <- glm( cbind(using, notUsing) ~
                  age + highEd + noOther , family = binomial, data =df3)

glm.qbin.inter <- glm( cbind(using,notUsing) ~ age*noOther + highEd , family=binomial, data=df3)

#summaries
summary(glm.bin)
## 
## Call:
## glm(formula = cbind(using, notUsing) ~ age + highEd + noOther, 
##     family = binomial, data = df3)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.5148  -0.9376   0.2408   0.9822   1.7333  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -1.9662     0.1720 -11.429  < 2e-16 ***
## age25-29      0.3894     0.1759   2.214  0.02681 *  
## age30-39      0.9086     0.1646   5.519 3.40e-08 ***
## age40-49      1.1892     0.2144   5.546 2.92e-08 ***
## highEdTRUE    0.3250     0.1240   2.620  0.00879 ** 
## noOtherTRUE   0.8330     0.1175   7.091 1.33e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 165.772  on 15  degrees of freedom
## Residual deviance:  29.917  on 10  degrees of freedom
## AIC: 113.43
## 
## Number of Fisher Scoring iterations: 4
summary(glm.qbin)
## 
## Call:
## glm(formula = cbind(using, notUsing) ~ age + highEd + noOther, 
##     family = binomial, data = df3)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.5148  -0.9376   0.2408   0.9822   1.7333  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -1.9662     0.1720 -11.429  < 2e-16 ***
## age25-29      0.3894     0.1759   2.214  0.02681 *  
## age30-39      0.9086     0.1646   5.519 3.40e-08 ***
## age40-49      1.1892     0.2144   5.546 2.92e-08 ***
## highEdTRUE    0.3250     0.1240   2.620  0.00879 ** 
## noOtherTRUE   0.8330     0.1175   7.091 1.33e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 165.772  on 15  degrees of freedom
## Residual deviance:  29.917  on 10  degrees of freedom
## AIC: 113.43
## 
## Number of Fisher Scoring iterations: 4
summary(glm.qbin.inter)
## 
## Call:
## glm(formula = cbind(using, notUsing) ~ age * noOther + highEd, 
##     family = binomial, data = df3)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -1.30027  -0.66163  -0.03286   0.81945   1.73851  
## 
## Coefficients:
##                      Estimate Std. Error z value Pr(>|z|)    
## (Intercept)          -1.80317    0.18018 -10.008  < 2e-16 ***
## age25-29              0.39460    0.20145   1.959  0.05013 .  
## age30-39              0.54666    0.19842   2.755  0.00587 ** 
## age40-49              0.57952    0.34742   1.668  0.09530 .  
## noOtherTRUE           0.06622    0.33071   0.200  0.84130    
## highEdTRUE            0.34065    0.12577   2.709  0.00676 ** 
## age25-29:noOtherTRUE  0.25918    0.40975   0.633  0.52704    
## age30-39:noOtherTRUE  1.11266    0.37404   2.975  0.00293 ** 
## age40-49:noOtherTRUE  1.36167    0.48433   2.811  0.00493 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 165.77  on 15  degrees of freedom
## Residual deviance:  12.63  on  7  degrees of freedom
## AIC: 102.14
## 
## Number of Fisher Scoring iterations: 4
tab_model(glm.bin,glm.qbin.inter,
          dv.labels = c("Quasi-Binomial", "Age Interaction"),
          pred.labels = c("Intercept", "Age", "Smoking", "Age:Smoking"
          ),
          show.se = TRUE, show.icc = FALSE, 
          show.ci = FALSE, show.df = TRUE, 
          show.obs = FALSE,
          string.est = "Estimate",
          string.p = "p-value",
          string.resp = "Response",
          string.pred = "Coefficients",
          string.df = "df")
## Length of `pred.labels` does not equal number of predictors, no labelling applied.
  Quasi-Binomial Age Interaction
Coefficients Estimate std. Error p-value df Estimate std. Error p-value df
(Intercept) 0.14 0.17 <0.001 10.00 0.16 0.18 <0.001 7.00
age25-29 1.48 0.18 0.027 10.00 1.48 0.20 0.050 7.00
age30-39 2.48 0.16 <0.001 10.00 1.73 0.20 0.006 7.00
age40-49 3.28 0.21 <0.001 10.00 1.79 0.35 0.095 7.00
highEdTRUE 1.38 0.12 0.009 10.00 1.41 0.13 0.007 7.00
noOtherTRUE 2.30 0.12 <0.001 10.00 1.07 0.33 0.841 7.00
age25-29:noOtherTRUE 1.30 0.41 0.527 7.00
age30-39:noOtherTRUE 3.04 0.37 0.003 7.00
age40-49:noOtherTRUE 3.90 0.48 0.005 7.00
R2 Tjur 0.025 0.035
#anovas
anova(glm.qbin, glm.qbin.inter, test = "Chisq")
Resid. Df Resid. Dev Df Deviance Pr(>Chi)
10 29.9           
7 12.6 3 17.3 0.000617
anova(glm.qbin.inter, test = "Chisq")
Df Deviance Resid. Df Resid. Dev Pr(>Chi)
    15 166          
3 79.2  12 86.6 4.58e-17
1 49.7  11 36.9 1.8e-12 
1 6.97 10 29.9 0.00829 
3 17.3  7 12.6 0.000617
#select model 
drop1(glm.qbin.inter, test = "Chisq")
Df Deviance AIC LRT Pr(>Chi)
12.6 102            
1 20.1 108 7.47 0.00628 
3 29.9 113 17.3  0.000617
add1(glm.qbin.inter, ~ .^2, test = "Chisq")
Df Deviance AIC LRT Pr(>Chi)
12.6 102          
3 5.8 101 6.83 0.0775
1 10.8 102 1.81 0.179 
search = step(glm.bin, ~.^2)
## Start:  AIC=113.43
## cbind(using, notUsing) ~ age + highEd + noOther
## 
##                  Df Deviance    AIC
## + age:noOther     3   12.630 102.14
## + highEd:noOther  1   23.017 108.53
## + age:highEd      3   23.151 112.66
## <none>                29.917 113.42
## - highEd          1   36.888 118.40
## - age             3   73.865 151.37
## - noOther         1   80.418 161.93
## 
## Step:  AIC=102.14
## cbind(using, notUsing) ~ age + highEd + noOther + age:noOther
## 
##                  Df Deviance    AIC
## + age:highEd      3   5.7983 101.31
## <none>               12.6296 102.14
## + highEd:noOther  1  10.8240 102.33
## - highEd          1  20.0990 107.61
## - age:noOther     3  29.9172 113.42
## 
## Step:  AIC=101.31
## cbind(using, notUsing) ~ age + highEd + noOther + age:noOther + 
##     age:highEd
## 
##                  Df Deviance     AIC
## + highEd:noOther  1   2.4415  99.949
## <none>                5.7983 101.306
## - age:highEd      3  12.6296 102.137
## - age:noOther     3  23.1511 112.659
## 
## Step:  AIC=99.95
## cbind(using, notUsing) ~ age + highEd + noOther + age:noOther + 
##     age:highEd + highEd:noOther
## 
##                  Df Deviance     AIC
## <none>                2.4415  99.949
## - highEd:noOther  1   5.7983 101.306
## - age:highEd      3  10.8240 102.332
## - age:noOther     3  13.7639 105.272
search$anova
Step Df Deviance Resid. Df Resid. Dev AIC
    10 29.9  113  
+ age:noOther -3 17.3  7 12.6  102  
+ age:highEd -3 6.83 4 5.8  101  
+ highEd:noOther -1 3.36 3 2.44 99.9
AICmodel <- glm(cbind(using, notUsing) ~ age + highEd + noOther + age:noOther + 
                  age:highEd + highEd:noOther, family = binomial, data=df3)
BIC(glm.qbin)
## [1] 118.0607
BIC(glm.qbin.inter)
## [1] 109.0908
BIC(AICmodel)
## [1] 109.993
#Diagnostic plots
par(mfrow=c(2,2))
plot(glm.qbin.inter)

#proportion p
(1/(1+1/(exp(-1.80317))))*100
## [1] 14.14656
(1/(1+1/(exp(-1.80317+1.52078+0.34+0.06+2.7335))))*100
## [1] 94.5376
#check valididty of model (?)
#the difference in deviance between the models
with(glm.qbin.inter, null.deviance - deviance)
## [1] 153.1428
#The degrees of freedom for the difference between the two models is equal to the number of predictor variables in the mode, and can be obtained using
with(glm.qbin.inter, df.null - df.residual)
## [1] 8
#p-value
with(glm.qbin.inter, pchisq(null.deviance - deviance, df.null - df.residual, lower.tail = FALSE))
## [1] 4.331423e-29
plot(allEffects(glm.qbin.inter),rescale.axis=F, ylab = ('Contraceptive pill usage'))

#Mixed Model 1 - Rats

df6$Offspring <- as.factor(df6$Offspring)
df6$Sex <- as.factor(df6$Sex)
df6$Treatment <- as.factor(df6$Treatment)

hist(df6$Weight, main = '', xlab = 'Weight')

ggplot(df6, aes(Treatment, Weight)) +
  geom_boxplot(aes(color = Treatment))+
  facet_wrap(~Sex) +
  scale_color_manual(values = c("#00AFBB", "#E7B800","turquoise4")) +
  ylab('Weight') +
  xlab('')+
  theme(axis.title.x=element_blank(),
        axis.text.x=element_blank(),
        axis.ticks.x=element_blank())

ggplot(df6, aes(Treatment, Weight)) +
  geom_boxplot(aes(color = Treatment))+
  facet_wrap(~Offspring.Size) +
  scale_color_manual(values = c("#00AFBB", "#E7B800","turquoise4")) +
  ylab('Weight') +
  xlab('')+
  theme(axis.title.x=element_blank(),
        axis.text.x=element_blank(),
        axis.ticks.x=element_blank())

which(!complete.cases(df6$Weight))
## integer(0)
#models
model_1 <- lmer(Weight ~ Treatment + Sex + Offspring.Size + (1|Offspring),  data = df6, REML = T) #original
model_2 <- lmer(Weight ~ Treatment + Sex +(1|Offspring),  data = df6, REML = F) #test for size
model_3 <-lmer (Weight ~ Treatment + Offspring.Size + (1|Offspring),  data = df6, REML =F) #test for gender 
model_4 <- lmer(Weight ~ Sex + Offspring.Size + (1|Offspring) , data = df6, REML = T) # treatment effect
model_5 <- lmer(Weight ~ Treatment*Sex + Treatment*Offspring.Size + (1|Offspring),  data = df6, REML = T) #interaction of treatment with sex
summ(model_1)
Observations 322
Dependent variable Weight
Type Mixed effects linear regression
AIC 411.00
BIC 437.42
Pseudo-R² (fixed effects) 0.42
Pseudo-R² (total) 0.64
Fixed Effects
Est. S.E. t val. d.f. p
(Intercept) 7.95 0.27 29.17 32.19 0.00
TreatmentHigh -0.86 0.18 -4.72 24.98 0.00
TreatmentLow -0.43 0.15 -2.85 22.90 0.01
SexMale 0.36 0.05 7.56 301.82 0.00
Offspring.Size -0.13 0.02 -6.86 31.67 0.00
p values calculated using Satterthwaite d.f.
Random Effects
Group Parameter Std. Dev.
Offspring (Intercept) 0.31
Residual 0.40
Grouping Variables
Group # groups ICC
Offspring 27 0.37
# HYPOTHESIS 1
rand(model_1) 
npar logLik AIC LRT Df Pr(>Chisq)
7 -198 411           
6 -244 499 90.5 1 1.87e-21
#testing fixed effect
Anova(model_1, type=2, test.statistic = "Chisq")  
Chisq Df Pr(>Chisq)
23.2 2 9.22e-06
57.2 1 3.97e-14
47.1 1 6.69e-12
anova(model_1,model_2) #testing size effect
## refitting model(s) with ML (instead of REML)
Df AIC BIC logLik deviance Chisq Chi Df Pr(>Chisq)
6 423 446 -205 411          
7 393 419 -189 379 32.2 1 1.4e-08
anova(model_1,model_3) #testing gender effect
## refitting model(s) with ML (instead of REML)
Df AIC BIC logLik deviance Chisq Chi Df Pr(>Chisq)
6 442 465 -215 430           
7 393 419 -189 379 51.3 1 8.08e-13
anova(model_1,model_4) #testing treatment effect
## refitting model(s) with ML (instead of REML)
Df AIC BIC logLik deviance Chisq Chi Df Pr(>Chisq)
5 407 426 -199 397           
7 393 419 -189 379 18.7 2 8.72e-05
anova(model_5,model_1) #testing interaction effect
## refitting model(s) with ML (instead of REML)
Df AIC BIC logLik deviance Chisq Chi Df Pr(>Chisq)
7 393 419 -189 379        
11 397 438 -187 375 4.1 4 0.392
Anova(model_5, type=2, test.statistic = "Chisq")
Chisq Df Pr(>Chisq)
24.2   2 5.58e-06
56.6   1 5.28e-14
48.2   1 3.78e-12
0.918 2 0.632   
2.66  2 0.265   
tab_model(model_1,model_2,model_3,model_4,model_5,
          dv.labels = c("Model 1", "Model 2","Model 3", "Model 4", "Model 5"),
          pred.labels = c("Intercept", "High Dose", "Low Dose", "Male",
                          "Offspring Size", "High Dose:Male", 
                          "Low Dose:Male", "High Dose:Offspring.Size", "Low Dose:Offspring.Size"),
          show.se = TRUE, show.icc = FALSE, 
          show.ci = FALSE, show.df = TRUE, 
          show.obs = FALSE,
          string.est = "Estimate",
          string.p = "p-value",
          string.resp = "Response",
          string.pred = "Coefficients",
          string.df = "df")
  Model 1 Model 2 Model 3 Model 4 Model 5
Coefficients Estimate std. Error p-value df Estimate std. Error p-value df Estimate std. Error p-value df Estimate std. Error p-value df Estimate std. Error p-value df
Intercept 7.95 0.27 <0.001 315.00 6.24 0.18 <0.001 316.00 8.08 0.25 <0.001 316.00 7.21 0.29 <0.001 317.00 7.82 0.38 <0.001 311.00
High Dose -0.86 0.18 <0.001 315.00 -0.36 0.27 0.189 316.00 -0.88 0.16 <0.001 316.00 -0.14 0.57 0.808 311.00
Low Dose -0.43 0.15 0.004 315.00 -0.38 0.25 0.125 316.00 -0.47 0.13 <0.001 316.00 -0.60 0.57 0.292 311.00
Male 0.36 0.05 <0.001 315.00 0.36 0.05 <0.001 316.00 0.37 0.05 <0.001 317.00 0.41 0.07 <0.001 311.00
Offspring Size -0.13 0.02 <0.001 315.00 -0.12 0.02 <0.001 316.00 -0.10 0.02 <0.001 317.00 -0.12 0.03 <0.001 311.00
High Dose:Male -0.10 0.13 0.455 311.00
Low Dose:Male -0.09 0.11 0.401 311.00
High Dose:Offspring.Size -0.07 0.05 0.185 311.00
Low Dose:Offspring.Size 0.02 0.04 0.687 311.00
Random Effects
σ2 0.16 0.16 0.20 0.16 0.16
τ00 0.10 Offspring 0.28 Offspring 0.07 Offspring 0.19 Offspring 0.09 Offspring
N 27 Offspring 27 Offspring 27 Offspring 27 Offspring 27 Offspring
Marginal R2 / Conditional R2 0.419 / 0.637 0.138 / 0.686 0.371 / 0.531 0.254 / 0.659 0.436 / 0.640
#check residuals for homoscedasticity
plot(model_1, col="darkblue",  main = 'Residuals plot',
     ylab = 'Residuals of model 1', xlab = 'Fitted Values')

# check residuals for normality
par(mfrow=c(1,2))
qqnorm(resid(model_1))
qqline(resid(model_1))
hist(resid(model_1), main = 'Histogram of residuals', xlab = 'Residuals')

summary(glht(model_1, linfct=mcp(Treatment="Tukey")))
## 
##   Simultaneous Tests for General Linear Hypotheses
## 
## Multiple Comparisons of Means: Tukey Contrasts
## 
## 
## Fit: lmer(formula = Weight ~ Treatment + Sex + Offspring.Size + (1 | 
##     Offspring), data = df6, REML = T)
## 
## Linear Hypotheses:
##                     Estimate Std. Error z value Pr(>|z|)    
## High - Control == 0  -0.8587     0.1818  -4.723   <0.001 ***
## Low - Control == 0   -0.4285     0.1504  -2.849   0.0119 *  
## Low - High == 0       0.4302     0.1804   2.385   0.0444 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## (Adjusted p values reported -- single-step method)
plot(allEffects(model_1))

#Mixed-model 2 - students

hist(sc$extro, main = 'Histogram of Extrovert Scores', xlab = 'Extrovert Scores')

ggplot(sc, aes(class, extro)) +
  geom_boxplot(aes(color = class))+
  facet_wrap(~school) +
  scale_color_manual(values = c("#FF3399", "#00AFBB", "#E7B800","turquoise4", "#B2182B", "navajowhite", "mediumvioletred","indianred1")) +
  ylab('Extrovert Scores') +
  xlab('')+
  theme(axis.title.x=element_blank(),
        axis.text.x=element_blank(),
        axis.ticks.x=element_blank())

ggplot(sc, aes(open, extro)) +
  geom_boxplot(aes(color = class))+
  facet_wrap(~school) +
  scale_color_manual(values = c("#FF3399", "#00AFBB", "#E7B800","turquoise4", "#B2182B", "navajowhite", "mediumvioletred","indianred1")) +
  ylab('Extrovert Scores') +
  xlab('Openess')+
  theme(axis.title.x=element_blank(),
        axis.text.x=element_blank(),
        axis.ticks.x=element_blank())

#models
model1 <- lmer(extro ~ open + agree + social + (1|class) + (1|school),  data = sc, REML = T) #original
model2 <- lmer(extro ~ open + agree + (1|class) + (1|school),  data = sc, REML = T) # check fro NAP per beach
model3 <- lmer(extro ~ open  + social + (1|class) + (1|school),  data = sc,  REML = T) #check for NAP
model4 <- lmer(extro ~ agree + social ++ (1|class) + (1|school),  data = sc, REML = T) #check for exposure
model5 <- lmer(extro ~ open * social + agree + (1|class) + (1|school),  data = sc, REML = T) #original
model6 <- lmer(extro ~ open * social + (1|class) + (1|school),  data = sc, REML = T) #original

summ(model1)
Observations 1200
Dependent variable extro
Type Mixed effects linear regression
AIC 4737.87
BIC 4773.50
Pseudo-R² (fixed effects) 0.00
Pseudo-R² (total) 0.97
Fixed Effects
Est. S.E. t val. d.f. p
(Intercept) 60.20 4.21 14.29 6.08 0.00
open 0.01 0.01 1.30 1188.01 0.19
agree -0.01 0.01 -0.56 1188.03 0.57
social -0.00 0.00 -0.57 1188.02 0.57
p values calculated using Satterthwaite d.f.
Random Effects
Group Parameter Std. Dev.
school (Intercept) 9.79
class (Intercept) 2.41
Residual 1.67
Grouping Variables
Group # groups ICC
school 6 0.92
class 4 0.06
# HYPOTHESIS 1
rand(model1) 
npar logLik AIC LRT Df Pr(>Chisq)
7 -2.36e+03 4.74e+03                 
6 -2.91e+03 5.84e+03 1.1e+03  1 2.94e-241
6 -4.37e+03 8.74e+03 4.01e+03 1 0        
#testing fixed effect
Anova(model1, type=2, test.statistic = "Chisq")  
Chisq Df Pr(>Chisq)
1.68  1 0.194
0.318 1 0.573
0.322 1 0.571
anova(model1,model2) 
## refitting model(s) with ML (instead of REML)
Df AIC BIC logLik deviance Chisq Chi Df Pr(>Chisq)
6 4.72e+03 4.75e+03 -2.35e+03 4.7e+03         
7 4.72e+03 4.75e+03 -2.35e+03 4.7e+03 0.322 1 0.57
anova(model1,model3) #
## refitting model(s) with ML (instead of REML)
Df AIC BIC logLik deviance Chisq Chi Df Pr(>Chisq)
6 4.72e+03 4.75e+03 -2.35e+03 4.7e+03          
7 4.72e+03 4.75e+03 -2.35e+03 4.7e+03 0.319 1 0.572
anova(model1,model4) #testing
## refitting model(s) with ML (instead of REML)
Df AIC BIC logLik deviance Chisq Chi Df Pr(>Chisq)
6 4.72e+03 4.75e+03 -2.35e+03 4.71e+03         
7 4.72e+03 4.75e+03 -2.35e+03 4.7e+03  1.69 1 0.194
anova(model1,model5) #testing 
## refitting model(s) with ML (instead of REML)
Df AIC BIC logLik deviance Chisq Chi Df Pr(>Chisq)
7 4.72e+03 4.75e+03 -2.35e+03 4.7e+03         
8 4.71e+03 4.75e+03 -2.35e+03 4.7e+03 5.6 1 0.0179
anova(model1,model5) #testing treatment effect
## refitting model(s) with ML (instead of REML)
Df AIC BIC logLik deviance Chisq Chi Df Pr(>Chisq)
7 4.72e+03 4.75e+03 -2.35e+03 4.7e+03         
8 4.71e+03 4.75e+03 -2.35e+03 4.7e+03 5.6 1 0.0179
Anova(model5, type=2, test.statistic = "Chisq") 
Chisq Df Pr(>Chisq)
1.69  1 0.194
0.323 1 0.57 
0.233 1 0.629
5.6   1 0.018
tab_model(model1,model2,model3,
          dv.labels = c("Model 1-A", "Model 2-A","Model 3-A"),
          pred.labels = c("Open", "Agree", "Social"),
          show.se = TRUE, show.icc = FALSE, 
          show.ci = FALSE, show.df = TRUE, 
          show.obs = FALSE,
          string.est = "Estimate",
          string.p = "p-value",
          string.resp = "Response",
          string.pred = "Coefficients",
          string.df = "df")
## Length of `pred.labels` does not equal number of predictors, no labelling applied.
  Model 1-A Model 2-A Model 3-A
Coefficients Estimate std. Error p-value df Estimate std. Error p-value df Estimate std. Error p-value df
(Intercept) 60.20 4.21 <0.001 1193.00 60.02 4.20 <0.001 1194.00 60.01 4.20 <0.001 1194.00
open 0.01 0.01 0.194 1193.00 0.01 0.01 0.194 1194.00 0.01 0.01 0.196 1194.00
agree -0.01 0.01 0.573 1193.00 -0.01 0.01 0.570 1194.00
social -0.00 0.00 0.571 1193.00 -0.00 0.00 0.568 1194.00
Random Effects
σ2 2.79 2.79 2.79
τ00 95.91 school 95.90 school 95.88 school
5.79 class 5.79 class 5.78 class
N 4 class 4 class 4 class
6 school 6 school 6 school
Marginal R2 / Conditional R2 0.000 / 0.973 0.000 / 0.973 0.000 / 0.973
summ(model4)
Observations 1200
Dependent variable extro
Type Mixed effects linear regression
AIC 4729.82
BIC 4760.36
Pseudo-R² (fixed effects) 0.00
Pseudo-R² (total) 0.97
Fixed Effects
Est. S.E. t val. d.f. p
(Intercept) 60.63 4.20 14.44 6.01 0.00
agree -0.01 0.01 -0.55 1189.03 0.58
social -0.00 0.00 -0.57 1189.02 0.57
p values calculated using Satterthwaite d.f.
Random Effects
Group Parameter Std. Dev.
school (Intercept) 9.79
class (Intercept) 2.41
Residual 1.67
Grouping Variables
Group # groups ICC
school 6 0.92
class 4 0.06
summ(model5)
Observations 1200
Dependent variable extro
Type Mixed effects linear regression
AIC 4747.55
BIC 4788.27
Pseudo-R² (fixed effects) 0.00
Pseudo-R² (total) 0.97
Fixed Effects
Est. S.E. t val. d.f. p
(Intercept) 55.23 4.71 11.73 9.47 0.00
open 0.14 0.05 2.54 1187.00 0.01
social 0.05 0.02 2.26 1187.01 0.02
agree -0.00 0.01 -0.48 1187.03 0.63
open:social -0.00 0.00 -2.37 1187.01 0.02
p values calculated using Satterthwaite d.f.
Random Effects
Group Parameter Std. Dev.
school (Intercept) 9.79
class (Intercept) 2.41
Residual 1.67
Grouping Variables
Group # groups ICC
school 6 0.92
class 4 0.06
summ(model6)
Observations 1200
Dependent variable extro
Type Mixed effects linear regression
AIC 4738.32
BIC 4773.96
Pseudo-R² (fixed effects) 0.00
Pseudo-R² (total) 0.97
Fixed Effects
Est. S.E. t val. d.f. p
(Intercept) 55.04 4.69 11.74 9.33 0.00
open 0.14 0.05 2.56 1188.00 0.01
social 0.05 0.02 2.27 1188.01 0.02
open:social -0.00 0.00 -2.38 1188.01 0.02
p values calculated using Satterthwaite d.f.
Random Effects
Group Parameter Std. Dev.
school (Intercept) 9.79
class (Intercept) 2.41
Residual 1.67
Grouping Variables
Group # groups ICC
school 6 0.92
class 4 0.06
#check residuals for homoscedasticity
plot(model6, col="darkblue",  main = 'Residuals plot',
     ylab = 'Residuals of model 1', xlab = 'Fitted Values')

# check residuals for normality
par(mfrow=c(1,2))
qqnorm(resid(model6))
qqline(resid(model6))
hist(resid(model6), main = 'Histogram of residuals', xlab = 'Residuals')

sc$open <- factor(sc$open)

#PCA

PCA.model<-princomp(~CO.mg.m3+SO2.ug.m3+TSPug.m3+
                      NAP+ACF+
                      ACE+FLU+
                      PHE+ANTH+
                      FLA+PYR, cor=TRUE, data=df7, scores=TRUE)

#the number of PCs to show can be set with ncps.
fviz_screeplot(PCA.model, addlabels = TRUE, main = 'Importance of principal components', xlab = 'Component')

cep_pc <<- within(df7, {
  PC3 <- PCA.model$scores[,3]
  PC2 <- PCA.model$scores[,2]
  PC1 <- PCA.model$scores[,1] })

get_eig(PCA.model)
eigenvalue variance.percent cumulative.variance.percent
4.7    42.7   42.7
2.03   18.4   61.1
1.39   12.7   73.8
0.847  7.7   81.5
0.648  5.9   87.4
0.466  4.24  91.6
0.394  3.59  95.2
0.251  2.28  97.5
0.108  0.978 98.5
0.0855 0.777 99.3
0.0813 0.739 100  
p3 <- autoplot(PCA.model, data = df7, colour = 'Season', size = 3.5, 
               loadings.label.size = 5) 
 
p3 + theme_light() +  scale_size_continuous(range = c(1,3))+
  scale_colour_discrete("Season") #+

  geom_text(label=df7$Sample, vjust = -1, size=3)
## geom_text: parse = FALSE, check_overlap = FALSE, na.rm = FALSE
## stat_identity: na.rm = FALSE
## position_identity
p2 <- autoplot(PCA.model, data = df7, colour = 'Phase',size = 3.5, loadings.label.size = 4) 
p2 + theme_light() + 
  scale_colour_discrete("Phase") #+

  geom_text(label=df7$Sample, vjust = -1, size=2)
## geom_text: parse = FALSE, check_overlap = FALSE, na.rm = FALSE
## stat_identity: na.rm = FALSE
## position_identity
p1 <- autoplot(PCA.model, data = df7, colour = 'Wind.direction',  size = 3.5, loadings.label.size = 4) + 
  theme_light() + scale_colour_discrete("Wind.direction") 
p1 + theme_light() + 
  scale_colour_discrete("Wind.direction") #+
## Scale for 'colour' is already present. Adding another scale for 'colour',
## which will replace the existing scale.

  geom_text(label=df7$Sample, vjust = -1, size=2)
## geom_text: parse = FALSE, check_overlap = FALSE, na.rm = FALSE
## stat_identity: na.rm = FALSE
## position_identity
fviz_screeplot(PCA.model, addlabels = TRUE, main = 'Importance of principal components', xlab = 'Component')

fviz_contrib(PCA.model, choice = "var", axes = 1)

fviz_contrib(PCA.model, choice = "var", axes = 2)

fviz_pca_var(PCA.model, col.var="contrib",
             gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
             repel = TRUE 
)

fviz_pca_biplot(PCA.model, label="var", habillage=df7$Phase, addEllipses=TRUE, ellipse.level=0.95, loadings.label.size = 4) 

fviz_pca_biplot(PCA.model, label="var", habillage=df7$Season, addEllipses=TRUE, ellipse.level=0.95, loadings.label.size = 4) 

p4<-ggplot(cep_pc) +
  aes(x = SO2.ug.m3, y = PC2, colour = Season) +
  geom_point(size = 2L) +
  scale_color_hue() +
  xlab('SO2(ug/m3)')+
  theme_light() +
  theme(legend.position ="none")
p6<-ggplot(cep_pc) +
  aes(x = CO.mg.m3, y = PC2, colour = Season) +
  geom_point(size = 2L) +
  scale_color_hue() +
  xlab('CO(mg/m3)')+
  theme_light() +
  theme(legend.position ="none")
p5<-ggplot(cep_pc) +
  aes(x = ANTH, y = PC1, colour = Season) +
  geom_point(size = 2L) +
  scale_color_hue() +
  xlab('Anthracene (ANTH) ')+
  theme_light() +
  theme(legend.position ="none")
p7<-ggplot(cep_pc) +
  aes(x = PHE, y = PC1, colour = Season) +
  geom_point(size = 2L) +
  scale_color_hue() +
  xlab('Phenanthrene (PHE)')+
  theme_light() +
  theme(legend.position ="none")
grid.arrange(p5,p7,p4,p6, ncol=2)