Final Project of the course: Statistical analysis and visualization in R: II, Södertörn University

#Libraries

library(interactions)
library(car)
library(jtools)
library(arm)
library(effects)
library(faraway)
library(mgcv)
library(sjPlot) 
library(sjmisc) 
library(lme4) 
library(interactions)
library(ggplot2)
library(ggeffects)
library(lme4)
library(sjPlot)
library(dplyr)
library(sjPlot)
library(sjmisc)
library(sjlabelled)
library(lme4)
library(ggpubr)
library(car)
library(effects)
library(gplots)
library(plotrix)
library(lmerTest)
library(lattice)
library(kableExtra)
library(nlme)
library(grid)
library(lmerTest)
library(multcomp)
library(readxl)
library(dplyr)
library(car)
library(gridExtra)
library(phyloseq)
library(ggfortify)
library(cowplot)
library(factoextra)
library(FactoMineR)

#Poisson

df1$smoking.habit <- factor(df1$smoking.habit)
glm.poisson1 <- glm(deaths.from.lung.cancer ~ age + smoking.habit, offset = log(population), family=poisson(link=log), data = df1)
glm.poisson2 <- glm(deaths.from.lung.cancer ~ age + smoking.habit, offset = log(population), family=quasipoisson(link=log), data = df1)
glm.poisson3 <- glm(deaths.from.lung.cancer ~ age + smoking.habit + age:smoking.habit, offset = log(population), family=poisson(link=log), data = df1)

resid.ssq <- sum(residuals(glm.poisson1,type="pearson")^2)  ## sum of squares of Pearson resids
resid.df <- nrow(df1)-length(coef(glm.poisson1))        ## estimated resid df (N-p)
resid.ssq/resid.df

## [1] 2.321771

#compare models
anova(glm.poisson2,glm.poisson3, test = "Chisq")

## Analysis of Deviance Table
## 
## Model 1: deaths.from.lung.cancer ~ age + smoking.habit
## Model 2: deaths.from.lung.cancer ~ age + smoking.habit + age:smoking.habit
##   Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1        31     75.734                     
## 2        28     75.109  3  0.62593   0.8905

summary(glm.poisson1)

## 
## Call:
## glm(formula = deaths.from.lung.cancer ~ age + smoking.habit, 
##     family = poisson(link = log), data = df1, offset = log(population))
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -4.0918  -1.1673  -0.2755   0.7803   2.6364  
## 
## Coefficients:
##                              Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                 -6.036992   0.085126 -70.918  < 2e-16 ***
## age                          0.066601   0.001118  59.559  < 2e-16 ***
## smoking.habitcigarretteOnly  0.405019   0.037463  10.811  < 2e-16 ***
## smoking.habitcigarrettePlus  0.203426   0.035996   5.651 1.59e-08 ***
## smoking.habitno             -0.032927   0.046894  -0.702    0.483    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 4055.984  on 35  degrees of freedom
## Residual deviance:   75.734  on 31  degrees of freedom
## AIC: 325.76
## 
## Number of Fisher Scoring iterations: 4

summary(glm.poisson2)

## 
## Call:
## glm(formula = deaths.from.lung.cancer ~ age + smoking.habit, 
##     family = quasipoisson(link = log), data = df1, offset = log(population))
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -4.0918  -1.1673  -0.2755   0.7803   2.6364  
## 
## Coefficients:
##                              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                 -6.036992   0.129710 -46.542  < 2e-16 ***
## age                          0.066601   0.001704  39.087  < 2e-16 ***
## smoking.habitcigarretteOnly  0.405019   0.057084   7.095 5.69e-08 ***
## smoking.habitcigarrettePlus  0.203426   0.054849   3.709 0.000815 ***
## smoking.habitno             -0.032927   0.071454  -0.461 0.648151    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for quasipoisson family taken to be 2.321771)
## 
##     Null deviance: 4055.984  on 35  degrees of freedom
## Residual deviance:   75.734  on 31  degrees of freedom
## AIC: NA
## 
## Number of Fisher Scoring iterations: 4

summary(glm.poisson3)

## 
## Call:
## glm(formula = deaths.from.lung.cancer ~ age + smoking.habit + 
##     age:smoking.habit, family = poisson(link = log), data = df1, 
##     offset = log(population))
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -3.7944  -1.2472  -0.2761   0.7303   2.7841  
## 
## Coefficients:
##                                   Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                     -6.0626209  0.2807291 -21.596   <2e-16 ***
## age                              0.0669626  0.0039341  17.021   <2e-16 ***
## smoking.habitcigarretteOnly      0.3614224  0.3040650   1.189    0.235    
## smoking.habitcigarrettePlus      0.2744951  0.2987242   0.919    0.358    
## smoking.habitno                  0.0495613  0.3730296   0.133    0.894    
## age:smoking.habitcigarretteOnly  0.0007739  0.0043653   0.177    0.859    
## age:smoking.habitcigarrettePlus -0.0010914  0.0042554  -0.256    0.798    
## age:smoking.habitno             -0.0012104  0.0053548  -0.226    0.821    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 4055.984  on 35  degrees of freedom
## Residual deviance:   75.109  on 28  degrees of freedom
## AIC: 331.13
## 
## Number of Fisher Scoring iterations: 4

tab_model(glm.poisson1,glm.poisson2,glm.poisson3,
          dv.labels = c("Poisson", "Quasi-Poisson", "Quasi-Poisson interaction","Quasi-Poisson interaction reduced"),
          pred.labels = c("Intercept", "Age", "Smoking", "Age:Smoking"
                          ),
          show.se = TRUE, show.icc = FALSE, 
          show.ci = FALSE, show.df = TRUE, 
          show.obs = FALSE,
          string.est = "Estimate",
          string.p = "p-value",
          string.resp = "Response",
          string.pred = "Coefficients",
          string.df = "df")

## Length of `pred.labels` does not equal number of predictors, no labelling applied.

	Poisson				Quasi-Poisson				Quasi-Poisson interaction
Coefficients	Estimate	std. Error	p-value	df	Estimate	std. Error	p-value	df	Estimate	std. Error	p-value	df
(Intercept)	0.00	0.09	<0.001	31.00	0.00	0.13	<0.001	31.00	0.00	0.28	<0.001	28.00
age	1.07	0.00	<0.001	31.00	1.07	0.00	<0.001	31.00	1.07	0.00	<0.001	28.00
smoking.habitcigarretteOnly	1.50	0.04	<0.001	31.00	1.50	0.06	<0.001	31.00	1.44	0.30	0.235	28.00
smoking.habitcigarrettePlus	1.23	0.04	<0.001	31.00	1.23	0.05	0.001	31.00	1.32	0.30	0.358	28.00
smoking.habitno	0.97	0.05	0.483	31.00	0.97	0.07	0.648	31.00	1.05	0.37	0.894	28.00
age:smoking.habitcigarretteOnly									1.00	0.00	0.859	28.00
age:smoking.habitcigarrettePlus									1.00	0.00	0.798	28.00
age:smoking.habitno									1.00	0.01	0.821	28.00
R² Nagelkerke	1.000				1.000				1.000

export_summs(glm.poisson1,glm.poisson2,glm.poisson3, scale = TRUE)

## Note: Pseudo-R2 for quasibinomial/quasipoisson families is calculated by
## refitting the fitted and null models as binomial/poisson.

## Registered S3 methods overwritten by 'broom.mixed':
##   method         from 
##   augment.lme    broom
##   augment.merMod broom
##   glance.lme     broom
##   glance.merMod  broom
##   glance.stanreg broom
##   tidy.brmsfit   broom
##   tidy.gamlss    broom
##   tidy.lme       broom
##   tidy.merMod    broom
##   tidy.rjags     broom
##   tidy.stanfit   broom
##   tidy.stanreg   broom

## Note: Pseudo-R2 for quasibinomial/quasipoisson families is calculated by
## refitting the fitted and null models as binomial/poisson.

	Model 1	Model 2	Model 3
(Intercept)	-2.04 ***	-2.04 ***	-2.04 ***
	(0.03)	(0.05)	(0.05)
age	0.87 ***	0.87 ***	0.88 ***
	(0.01)	(0.02)	(0.05)
smoking.habitcigarretteOnly	0.41 ***	0.41 ***	0.41 ***
	(0.04)	(0.06)	(0.06)
smoking.habitcigarrettePlus	0.20 ***	0.20 ***	0.21 ***
	(0.04)	(0.05)	(0.06)
smoking.habitno	-0.03	-0.03	-0.02
	(0.05)	(0.07)	(0.07)
age:smoking.habitcigarretteOnly			0.01
			(0.06)
age:smoking.habitcigarrettePlus			-0.01
			(0.06)
age:smoking.habitno			-0.02
			(0.07)
N	36	36	36
AIC	325.76		331.13
BIC	333.67		343.80
Pseudo R2	1.00	1.00	1.00
All continuous predictors are mean-centered and scaled by 1 standard deviation. * p < 0.001; p < 0.01; * p < 0.05.

#Diagnostic plots
par(mfrow=c(2,2))
plot(glm.poisson2)

# extract coefficients from first model using 'coef()'
coef1 = coef(glm.poisson1)
coef2 = coef(glm.poisson2)
coef3 = coef(glm.poisson3)
# extract standard errors from first model using 'se.coef()'
se.coef1 = se.coef(glm.poisson1)
se.coef2 = se.coef(glm.poisson2)
se.coef3 = se.coef(glm.poisson3)
# use 'cbind()' to combine values into one dataframe
models.both <- cbind(coef1, se.coef1, coef2, se.coef2,coef3, se.coef3, exponent = exp(coef1))

## Warning in cbind(coef1, se.coef1, coef2, se.coef2, coef3, se.coef3, exponent =
## exp(coef1)): number of rows of result is not a multiple of vector length (arg 1)

# show dataframe
models.both

##                                       coef1    se.coef1       coef2    se.coef2
## (Intercept)                     -6.03699249 0.085126022 -6.03699249 0.129709585
## age                              0.06660122 0.001118244  0.06660122 0.001703908
## smoking.habitcigarretteOnly      0.40501915 0.037462865  0.40501915 0.057083517
## smoking.habitcigarrettePlus      0.20342645 0.035996098  0.20342645 0.054848552
## smoking.habitno                 -0.03292676 0.046893872 -0.03292676 0.071453882
## age:smoking.habitcigarretteOnly -6.03699249 0.085126022 -6.03699249 0.129709585
## age:smoking.habitcigarrettePlus  0.06660122 0.001118244  0.06660122 0.001703908
## age:smoking.habitno              0.40501915 0.037462865  0.40501915 0.057083517
##                                         coef3    se.coef3    exponent
## (Intercept)                     -6.0626208546 0.280729078 0.002388732
## age                              0.0669626334 0.003934071 1.068869147
## smoking.habitcigarretteOnly      0.3614224031 0.304064955 1.499331216
## smoking.habitcigarrettePlus      0.2744950902 0.298724172 1.225595009
## smoking.habitno                  0.0495612783 0.373029559 0.967609427
## age:smoking.habitcigarretteOnly  0.0007739244 0.004365325 0.002388732
## age:smoking.habitcigarrettePlus -0.0010913929 0.004255373 1.068869147
## age:smoking.habitno             -0.0012104216 0.005354802 1.499331216

# plot regression coefficients for poisson.model2 and poisson.model
plot_summs(glm.poisson1,glm.poisson2,glm.poisson3, scale = TRUE, inner_ci_level = .9)

## Note: Pseudo-R2 for quasibinomial/quasipoisson families is calculated by
## refitting the fitted and null models as binomial/poisson.
## Note: Pseudo-R2 for quasibinomial/quasipoisson families is calculated by
## refitting the fitted and null models as binomial/poisson.

#plot effects
effect_plot(glm.poisson2, pred = age, interval = TRUE, plot.points = TRUE, 
            main = ('Age effects on lung cancer'), ylab = ('Number of deaths'), xlab = ('Age'))

## Using data df1 from global environment. This could cause incorrect results
## if df1 has been altered since the model was fit. You can manually provide
## the data to the "data =" argument.

## Outcome is based on a total of 1 exposures

effect_plot(glm.poisson2, pred = smoking.habit, interval = TRUE, plot.points = TRUE, 
            main = ('Smoking effects on lung cancer'))

## Using data df1 from global environment. This could cause incorrect results
## if df1 has been altered since the model was fit. You can manually provide
## the data to the "data =" argument.
## Outcome is based on a total of 1 exposures

plot(allEffects(glm.poisson2),rescale.axis=F, ylab = ('Number of deaths'))

#GAM
gam.lung <- 
  gam(deaths.from.lung.cancer ~ s(age, k=9) + 
        s(smoking.habit, k=4), Family = poisson(link=log), data = df2)
summary(gam.lung)

## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## deaths.from.lung.cancer ~ s(age, k = 9) + s(smoking.habit, k = 4)
## 
## Parametric coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   253.61      26.24   9.667 1.69e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                    edf Ref.df      F  p-value    
## s(age)           3.612  4.459  6.952 0.000278 ***
## s(smoking.habit) 2.880  2.989 11.113 3.15e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.641   Deviance explained = 70.7%
## GCV =  31290  Scale est. = 24778     n = 36

#Interactions between age and smoking on death lung cancer plots

interact_plot(gam.lung, pred=smoking.habit, modx=age, 
              main = ('Smoking Habit ~ Age interaction plot'), 
              ylab = ('Deaths from lung cancer'), xlab = ('Age'))

## Warning: smoking.habit and age are not included in an interaction with one another
## in the model.

interact_plot(gam.lung, pred=age, modx=smoking.habit, 
              main = ('Age ~ Smoking Habit interaction plot'), 
              ylab = ('Deaths from lung cancer'), xlab = ('Smoking Habit'))

## Warning: age and smoking.habit are not included in an interaction with one another
## in the model.

#Binomial

df3$noOther <- df3$wantsMore == "no"
df3$highEd <- df3$education == "high"

glm.bin <- glm( cbind(using, notUsing) ~
                 age + highEd + noOther , family = binomial, data =df3)
resid.ssqb <- sum(residuals(glm.bin,type="pearson")^2)  ## sum of squares of Pearson resids
resid.dfb <- nrow(df3)-length(coef(glm.bin))        ## estimated resid df (N-p)
resid.ssqb/resid.dfb

## [1] 2.828834

glm.qbin <- glm( cbind(using, notUsing) ~
                  age + highEd + noOther , family = binomial, data =df3)

glm.qbin.inter <- glm( cbind(using,notUsing) ~ age*noOther + highEd , family=binomial, data=df3)

#summaries
summary(glm.bin)

## 
## Call:
## glm(formula = cbind(using, notUsing) ~ age + highEd + noOther, 
##     family = binomial, data = df3)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.5148  -0.9376   0.2408   0.9822   1.7333  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -1.9662     0.1720 -11.429  < 2e-16 ***
## age25-29      0.3894     0.1759   2.214  0.02681 *  
## age30-39      0.9086     0.1646   5.519 3.40e-08 ***
## age40-49      1.1892     0.2144   5.546 2.92e-08 ***
## highEdTRUE    0.3250     0.1240   2.620  0.00879 ** 
## noOtherTRUE   0.8330     0.1175   7.091 1.33e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 165.772  on 15  degrees of freedom
## Residual deviance:  29.917  on 10  degrees of freedom
## AIC: 113.43
## 
## Number of Fisher Scoring iterations: 4

summary(glm.qbin)

## 
## Call:
## glm(formula = cbind(using, notUsing) ~ age + highEd + noOther, 
##     family = binomial, data = df3)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.5148  -0.9376   0.2408   0.9822   1.7333  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -1.9662     0.1720 -11.429  < 2e-16 ***
## age25-29      0.3894     0.1759   2.214  0.02681 *  
## age30-39      0.9086     0.1646   5.519 3.40e-08 ***
## age40-49      1.1892     0.2144   5.546 2.92e-08 ***
## highEdTRUE    0.3250     0.1240   2.620  0.00879 ** 
## noOtherTRUE   0.8330     0.1175   7.091 1.33e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 165.772  on 15  degrees of freedom
## Residual deviance:  29.917  on 10  degrees of freedom
## AIC: 113.43
## 
## Number of Fisher Scoring iterations: 4

summary(glm.qbin.inter)

## 
## Call:
## glm(formula = cbind(using, notUsing) ~ age * noOther + highEd, 
##     family = binomial, data = df3)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -1.30027  -0.66163  -0.03286   0.81945   1.73851  
## 
## Coefficients:
##                      Estimate Std. Error z value Pr(>|z|)    
## (Intercept)          -1.80317    0.18018 -10.008  < 2e-16 ***
## age25-29              0.39460    0.20145   1.959  0.05013 .  
## age30-39              0.54666    0.19842   2.755  0.00587 ** 
## age40-49              0.57952    0.34742   1.668  0.09530 .  
## noOtherTRUE           0.06622    0.33071   0.200  0.84130    
## highEdTRUE            0.34065    0.12577   2.709  0.00676 ** 
## age25-29:noOtherTRUE  0.25918    0.40975   0.633  0.52704    
## age30-39:noOtherTRUE  1.11266    0.37404   2.975  0.00293 ** 
## age40-49:noOtherTRUE  1.36167    0.48433   2.811  0.00493 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 165.77  on 15  degrees of freedom
## Residual deviance:  12.63  on  7  degrees of freedom
## AIC: 102.14
## 
## Number of Fisher Scoring iterations: 4

tab_model(glm.bin,glm.qbin.inter,
          dv.labels = c("Quasi-Binomial", "Age Interaction"),
          pred.labels = c("Intercept", "Age", "Smoking", "Age:Smoking"
          ),
          show.se = TRUE, show.icc = FALSE, 
          show.ci = FALSE, show.df = TRUE, 
          show.obs = FALSE,
          string.est = "Estimate",
          string.p = "p-value",
          string.resp = "Response",
          string.pred = "Coefficients",
          string.df = "df")

## Length of `pred.labels` does not equal number of predictors, no labelling applied.

	Quasi-Binomial				Age Interaction
Coefficients	Estimate	std. Error	p-value	df	Estimate	std. Error	p-value	df
(Intercept)	0.14	0.17	<0.001	10.00	0.16	0.18	<0.001	7.00
age25-29	1.48	0.18	0.027	10.00	1.48	0.20	0.050	7.00
age30-39	2.48	0.16	<0.001	10.00	1.73	0.20	0.006	7.00
age40-49	3.28	0.21	<0.001	10.00	1.79	0.35	0.095	7.00
highEdTRUE	1.38	0.12	0.009	10.00	1.41	0.13	0.007	7.00
noOtherTRUE	2.30	0.12	<0.001	10.00	1.07	0.33	0.841	7.00
age25-29:noOtherTRUE					1.30	0.41	0.527	7.00
age30-39:noOtherTRUE					3.04	0.37	0.003	7.00
age40-49:noOtherTRUE					3.90	0.48	0.005	7.00
R² Tjur	0.025				0.035

#anovas
anova(glm.qbin, glm.qbin.inter, test = "Chisq")

Resid. Df	Resid. Dev	Df	Deviance	Pr(>Chi)
10	29.9
7	12.6	3	17.3	0.000617

anova(glm.qbin.inter, test = "Chisq")

Df	Deviance	Resid. Df	Resid. Dev	Pr(>Chi)
		15	166
3	79.2	12	86.6	4.58e-17
1	49.7	11	36.9	1.8e-12
1	6.97	10	29.9	0.00829
3	17.3	7	12.6	0.000617

#select model 
drop1(glm.qbin.inter, test = "Chisq")

Df	Deviance	AIC	LRT	Pr(>Chi)
	12.6	102
1	20.1	108	7.47	0.00628
3	29.9	113	17.3	0.000617

add1(glm.qbin.inter, ~ .^2, test = "Chisq")

Df	Deviance	AIC	LRT	Pr(>Chi)
	12.6	102
3	5.8	101	6.83	0.0775
1	10.8	102	1.81	0.179

search = step(glm.bin, ~.^2)

## Start:  AIC=113.43
## cbind(using, notUsing) ~ age + highEd + noOther
## 
##                  Df Deviance    AIC
## + age:noOther     3   12.630 102.14
## + highEd:noOther  1   23.017 108.53
## + age:highEd      3   23.151 112.66
## <none>                29.917 113.42
## - highEd          1   36.888 118.40
## - age             3   73.865 151.37
## - noOther         1   80.418 161.93
## 
## Step:  AIC=102.14
## cbind(using, notUsing) ~ age + highEd + noOther + age:noOther
## 
##                  Df Deviance    AIC
## + age:highEd      3   5.7983 101.31
## <none>               12.6296 102.14
## + highEd:noOther  1  10.8240 102.33
## - highEd          1  20.0990 107.61
## - age:noOther     3  29.9172 113.42
## 
## Step:  AIC=101.31
## cbind(using, notUsing) ~ age + highEd + noOther + age:noOther + 
##     age:highEd
## 
##                  Df Deviance     AIC
## + highEd:noOther  1   2.4415  99.949
## <none>                5.7983 101.306
## - age:highEd      3  12.6296 102.137
## - age:noOther     3  23.1511 112.659
## 
## Step:  AIC=99.95
## cbind(using, notUsing) ~ age + highEd + noOther + age:noOther + 
##     age:highEd + highEd:noOther
## 
##                  Df Deviance     AIC
## <none>                2.4415  99.949
## - highEd:noOther  1   5.7983 101.306
## - age:highEd      3  10.8240 102.332
## - age:noOther     3  13.7639 105.272

search$anova

Step	Df	Deviance	Resid. Df	Resid. Dev	AIC
			10	29.9	113
+ age:noOther	-3	17.3	7	12.6	102
+ age:highEd	-3	6.83	4	5.8	101
+ highEd:noOther	-1	3.36	3	2.44	99.9

AICmodel <- glm(cbind(using, notUsing) ~ age + highEd + noOther + age:noOther + 
                  age:highEd + highEd:noOther, family = binomial, data=df3)
BIC(glm.qbin)

## [1] 118.0607

BIC(glm.qbin.inter)

## [1] 109.0908

BIC(AICmodel)

## [1] 109.993

#Diagnostic plots
par(mfrow=c(2,2))
plot(glm.qbin.inter)

#proportion p
(1/(1+1/(exp(-1.80317))))*100

## [1] 14.14656

(1/(1+1/(exp(-1.80317+1.52078+0.34+0.06+2.7335))))*100

## [1] 94.5376

#check valididty of model (?)
#the difference in deviance between the models
with(glm.qbin.inter, null.deviance - deviance)

## [1] 153.1428

#The degrees of freedom for the difference between the two models is equal to the number of predictor variables in the mode, and can be obtained using
with(glm.qbin.inter, df.null - df.residual)

## [1] 8

#p-value
with(glm.qbin.inter, pchisq(null.deviance - deviance, df.null - df.residual, lower.tail = FALSE))

## [1] 4.331423e-29

plot(allEffects(glm.qbin.inter),rescale.axis=F, ylab = ('Contraceptive pill usage'))

#Mixed Model 1 - Rats

df6$Offspring <- as.factor(df6$Offspring)
df6$Sex <- as.factor(df6$Sex)
df6$Treatment <- as.factor(df6$Treatment)

hist(df6$Weight, main = '', xlab = 'Weight')

ggplot(df6, aes(Treatment, Weight)) +
  geom_boxplot(aes(color = Treatment))+
  facet_wrap(~Sex) +
  scale_color_manual(values = c("#00AFBB", "#E7B800","turquoise4")) +
  ylab('Weight') +
  xlab('')+
  theme(axis.title.x=element_blank(),
        axis.text.x=element_blank(),
        axis.ticks.x=element_blank())

ggplot(df6, aes(Treatment, Weight)) +
  geom_boxplot(aes(color = Treatment))+
  facet_wrap(~Offspring.Size) +
  scale_color_manual(values = c("#00AFBB", "#E7B800","turquoise4")) +
  ylab('Weight') +
  xlab('')+
  theme(axis.title.x=element_blank(),
        axis.text.x=element_blank(),
        axis.ticks.x=element_blank())

which(!complete.cases(df6$Weight))

## integer(0)

#models
model_1 <- lmer(Weight ~ Treatment + Sex + Offspring.Size + (1|Offspring),  data = df6, REML = T) #original
model_2 <- lmer(Weight ~ Treatment + Sex +(1|Offspring),  data = df6, REML = F) #test for size
model_3 <-lmer (Weight ~ Treatment + Offspring.Size + (1|Offspring),  data = df6, REML =F) #test for gender 
model_4 <- lmer(Weight ~ Sex + Offspring.Size + (1|Offspring) , data = df6, REML = T) # treatment effect
model_5 <- lmer(Weight ~ Treatment*Sex + Treatment*Offspring.Size + (1|Offspring),  data = df6, REML = T) #interaction of treatment with sex
summ(model_1)

Observations	322
Dependent variable	Weight
Type	Mixed effects linear regression

AIC	411.00
BIC	437.42
Pseudo-R² (fixed effects)	0.42
Pseudo-R² (total)	0.64

Fixed Effects
	Est.	S.E.	t val.	d.f.	p
(Intercept)	7.95	0.27	29.17	32.19	0.00
TreatmentHigh	-0.86	0.18	-4.72	24.98	0.00
TreatmentLow	-0.43	0.15	-2.85	22.90	0.01
SexMale	0.36	0.05	7.56	301.82	0.00
Offspring.Size	-0.13	0.02	-6.86	31.67	0.00
p values calculated using Satterthwaite d.f.

Random Effects
Group	Parameter	Std. Dev.
Offspring	(Intercept)	0.31
Residual		0.40

Grouping Variables
Group	# groups	ICC
Offspring	27	0.37

# HYPOTHESIS 1
rand(model_1)

npar	logLik	AIC	LRT	Df	Pr(>Chisq)
7	-198	411
6	-244	499	90.5	1	1.87e-21

#testing fixed effect
Anova(model_1, type=2, test.statistic = "Chisq")

Chisq	Df	Pr(>Chisq)
23.2	2	9.22e-06
57.2	1	3.97e-14
47.1	1	6.69e-12

anova(model_1,model_2) #testing size effect

## refitting model(s) with ML (instead of REML)

Df	AIC	BIC	logLik	deviance	Chisq	Chi Df	Pr(>Chisq)
6	423	446	-205	411
7	393	419	-189	379	32.2	1	1.4e-08

anova(model_1,model_3) #testing gender effect

## refitting model(s) with ML (instead of REML)

Df	AIC	BIC	logLik	deviance	Chisq	Chi Df	Pr(>Chisq)
6	442	465	-215	430
7	393	419	-189	379	51.3	1	8.08e-13

anova(model_1,model_4) #testing treatment effect

## refitting model(s) with ML (instead of REML)

Df	AIC	BIC	logLik	deviance	Chisq	Chi Df	Pr(>Chisq)
5	407	426	-199	397
7	393	419	-189	379	18.7	2	8.72e-05

anova(model_5,model_1) #testing interaction effect

## refitting model(s) with ML (instead of REML)

Df	AIC	BIC	logLik	deviance	Chisq	Chi Df	Pr(>Chisq)
7	393	419	-189	379
11	397	438	-187	375	4.1	4	0.392

Anova(model_5, type=2, test.statistic = "Chisq")

Chisq	Df	Pr(>Chisq)
24.2	2	5.58e-06
56.6	1	5.28e-14
48.2	1	3.78e-12
0.918	2	0.632
2.66	2	0.265

tab_model(model_1,model_2,model_3,model_4,model_5,
          dv.labels = c("Model 1", "Model 2","Model 3", "Model 4", "Model 5"),
          pred.labels = c("Intercept", "High Dose", "Low Dose", "Male",
                          "Offspring Size", "High Dose:Male", 
                          "Low Dose:Male", "High Dose:Offspring.Size", "Low Dose:Offspring.Size"),
          show.se = TRUE, show.icc = FALSE, 
          show.ci = FALSE, show.df = TRUE, 
          show.obs = FALSE,
          string.est = "Estimate",
          string.p = "p-value",
          string.resp = "Response",
          string.pred = "Coefficients",
          string.df = "df")

	Model 1				Model 2				Model 3				Model 4				Model 5
Coefficients	Estimate	std. Error	p-value	df	Estimate	std. Error	p-value	df	Estimate	std. Error	p-value	df	Estimate	std. Error	p-value	df	Estimate	std. Error	p-value	df
Intercept	7.95	0.27	<0.001	315.00	6.24	0.18	<0.001	316.00	8.08	0.25	<0.001	316.00	7.21	0.29	<0.001	317.00	7.82	0.38	<0.001	311.00
High Dose	-0.86	0.18	<0.001	315.00	-0.36	0.27	0.189	316.00	-0.88	0.16	<0.001	316.00					-0.14	0.57	0.808	311.00
Low Dose	-0.43	0.15	0.004	315.00	-0.38	0.25	0.125	316.00	-0.47	0.13	<0.001	316.00					-0.60	0.57	0.292	311.00
Male	0.36	0.05	<0.001	315.00	0.36	0.05	<0.001	316.00					0.37	0.05	<0.001	317.00	0.41	0.07	<0.001	311.00
Offspring Size	-0.13	0.02	<0.001	315.00					-0.12	0.02	<0.001	316.00	-0.10	0.02	<0.001	317.00	-0.12	0.03	<0.001	311.00
High Dose:Male																	-0.10	0.13	0.455	311.00
Low Dose:Male																	-0.09	0.11	0.401	311.00
High Dose:Offspring.Size																	-0.07	0.05	0.185	311.00
Low Dose:Offspring.Size																	0.02	0.04	0.687	311.00
Random Effects
σ²	0.16				0.16				0.20				0.16				0.16
τ₀₀	0.10 _Offspring				0.28 _Offspring				0.07 _Offspring				0.19 _Offspring				0.09 _Offspring
N	27 _Offspring				27 _Offspring				27 _Offspring				27 _Offspring				27 _Offspring
Marginal R² / Conditional R²	0.419 / 0.637				0.138 / 0.686				0.371 / 0.531				0.254 / 0.659				0.436 / 0.640

#check residuals for homoscedasticity
plot(model_1, col="darkblue",  main = 'Residuals plot',
     ylab = 'Residuals of model 1', xlab = 'Fitted Values')

# check residuals for normality
par(mfrow=c(1,2))
qqnorm(resid(model_1))
qqline(resid(model_1))
hist(resid(model_1), main = 'Histogram of residuals', xlab = 'Residuals')

summary(glht(model_1, linfct=mcp(Treatment="Tukey")))

## 
##   Simultaneous Tests for General Linear Hypotheses
## 
## Multiple Comparisons of Means: Tukey Contrasts
## 
## 
## Fit: lmer(formula = Weight ~ Treatment + Sex + Offspring.Size + (1 | 
##     Offspring), data = df6, REML = T)
## 
## Linear Hypotheses:
##                     Estimate Std. Error z value Pr(>|z|)    
## High - Control == 0  -0.8587     0.1818  -4.723   <0.001 ***
## Low - Control == 0   -0.4285     0.1504  -2.849   0.0119 *  
## Low - High == 0       0.4302     0.1804   2.385   0.0444 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## (Adjusted p values reported -- single-step method)

plot(allEffects(model_1))

#Mixed-model 2 - students

hist(sc$extro, main = 'Histogram of Extrovert Scores', xlab = 'Extrovert Scores')

ggplot(sc, aes(class, extro)) +
  geom_boxplot(aes(color = class))+
  facet_wrap(~school) +
  scale_color_manual(values = c("#FF3399", "#00AFBB", "#E7B800","turquoise4", "#B2182B", "navajowhite", "mediumvioletred","indianred1")) +
  ylab('Extrovert Scores') +
  xlab('')+
  theme(axis.title.x=element_blank(),
        axis.text.x=element_blank(),
        axis.ticks.x=element_blank())

ggplot(sc, aes(open, extro)) +
  geom_boxplot(aes(color = class))+
  facet_wrap(~school) +
  scale_color_manual(values = c("#FF3399", "#00AFBB", "#E7B800","turquoise4", "#B2182B", "navajowhite", "mediumvioletred","indianred1")) +
  ylab('Extrovert Scores') +
  xlab('Openess')+
  theme(axis.title.x=element_blank(),
        axis.text.x=element_blank(),
        axis.ticks.x=element_blank())

#models
model1 <- lmer(extro ~ open + agree + social + (1|class) + (1|school),  data = sc, REML = T) #original
model2 <- lmer(extro ~ open + agree + (1|class) + (1|school),  data = sc, REML = T) # check fro NAP per beach
model3 <- lmer(extro ~ open  + social + (1|class) + (1|school),  data = sc,  REML = T) #check for NAP
model4 <- lmer(extro ~ agree + social ++ (1|class) + (1|school),  data = sc, REML = T) #check for exposure
model5 <- lmer(extro ~ open * social + agree + (1|class) + (1|school),  data = sc, REML = T) #original
model6 <- lmer(extro ~ open * social + (1|class) + (1|school),  data = sc, REML = T) #original

summ(model1)

Observations	1200
Dependent variable	extro
Type	Mixed effects linear regression

AIC	4737.87
BIC	4773.50
Pseudo-R² (fixed effects)	0.00
Pseudo-R² (total)	0.97

Fixed Effects
	Est.	S.E.	t val.	d.f.	p
(Intercept)	60.20	4.21	14.29	6.08	0.00
open	0.01	0.01	1.30	1188.01	0.19
agree	-0.01	0.01	-0.56	1188.03	0.57
social	-0.00	0.00	-0.57	1188.02	0.57
p values calculated using Satterthwaite d.f.

Random Effects
Group	Parameter	Std. Dev.
school	(Intercept)	9.79
class	(Intercept)	2.41
Residual		1.67

Grouping Variables
Group	# groups	ICC
school	6	0.92
class	4	0.06

# HYPOTHESIS 1
rand(model1)

npar	logLik	AIC	LRT	Df	Pr(>Chisq)
7	-2.36e+03	4.74e+03
6	-2.91e+03	5.84e+03	1.1e+03	1	2.94e-241
6	-4.37e+03	8.74e+03	4.01e+03	1	0

#testing fixed effect
Anova(model1, type=2, test.statistic = "Chisq")

Chisq	Df	Pr(>Chisq)
1.68	1	0.194
0.318	1	0.573
0.322	1	0.571

anova(model1,model2)

## refitting model(s) with ML (instead of REML)

Df	AIC	BIC	logLik	deviance	Chisq	Chi Df	Pr(>Chisq)
6	4.72e+03	4.75e+03	-2.35e+03	4.7e+03
7	4.72e+03	4.75e+03	-2.35e+03	4.7e+03	0.322	1	0.57

anova(model1,model3) #

## refitting model(s) with ML (instead of REML)

Df	AIC	BIC	logLik	deviance	Chisq	Chi Df	Pr(>Chisq)
6	4.72e+03	4.75e+03	-2.35e+03	4.7e+03
7	4.72e+03	4.75e+03	-2.35e+03	4.7e+03	0.319	1	0.572

anova(model1,model4) #testing

## refitting model(s) with ML (instead of REML)

Df	AIC	BIC	logLik	deviance	Chisq	Chi Df	Pr(>Chisq)
6	4.72e+03	4.75e+03	-2.35e+03	4.71e+03
7	4.72e+03	4.75e+03	-2.35e+03	4.7e+03	1.69	1	0.194

anova(model1,model5) #testing

## refitting model(s) with ML (instead of REML)

Df	AIC	BIC	logLik	deviance	Chisq	Chi Df	Pr(>Chisq)
7	4.72e+03	4.75e+03	-2.35e+03	4.7e+03
8	4.71e+03	4.75e+03	-2.35e+03	4.7e+03	5.6	1	0.0179

anova(model1,model5) #testing treatment effect

## refitting model(s) with ML (instead of REML)

Df	AIC	BIC	logLik	deviance	Chisq	Chi Df	Pr(>Chisq)
7	4.72e+03	4.75e+03	-2.35e+03	4.7e+03
8	4.71e+03	4.75e+03	-2.35e+03	4.7e+03	5.6	1	0.0179

Anova(model5, type=2, test.statistic = "Chisq")

Chisq	Df	Pr(>Chisq)
1.69	1	0.194
0.323	1	0.57
0.233	1	0.629
5.6	1	0.018

tab_model(model1,model2,model3,
          dv.labels = c("Model 1-A", "Model 2-A","Model 3-A"),
          pred.labels = c("Open", "Agree", "Social"),
          show.se = TRUE, show.icc = FALSE, 
          show.ci = FALSE, show.df = TRUE, 
          show.obs = FALSE,
          string.est = "Estimate",
          string.p = "p-value",
          string.resp = "Response",
          string.pred = "Coefficients",
          string.df = "df")

## Length of `pred.labels` does not equal number of predictors, no labelling applied.

	Model 1-A				Model 2-A				Model 3-A
Coefficients	Estimate	std. Error	p-value	df	Estimate	std. Error	p-value	df	Estimate	std. Error	p-value	df
(Intercept)	60.20	4.21	<0.001	1193.00	60.02	4.20	<0.001	1194.00	60.01	4.20	<0.001	1194.00
open	0.01	0.01	0.194	1193.00	0.01	0.01	0.194	1194.00	0.01	0.01	0.196	1194.00
agree	-0.01	0.01	0.573	1193.00	-0.01	0.01	0.570	1194.00
social	-0.00	0.00	0.571	1193.00					-0.00	0.00	0.568	1194.00
Random Effects
σ²	2.79				2.79				2.79
τ₀₀	95.91 _school				95.90 _school				95.88 _school
	5.79 _class				5.79 _class				5.78 _class
N	4 _class				4 _class				4 _class
	6 _school				6 _school				6 _school
Marginal R² / Conditional R²	0.000 / 0.973				0.000 / 0.973				0.000 / 0.973

summ(model4)

Observations	1200
Dependent variable	extro
Type	Mixed effects linear regression

AIC	4729.82
BIC	4760.36
Pseudo-R² (fixed effects)	0.00
Pseudo-R² (total)	0.97

Fixed Effects
	Est.	S.E.	t val.	d.f.	p
(Intercept)	60.63	4.20	14.44	6.01	0.00
agree	-0.01	0.01	-0.55	1189.03	0.58
social	-0.00	0.00	-0.57	1189.02	0.57
p values calculated using Satterthwaite d.f.

Random Effects
Group	Parameter	Std. Dev.
school	(Intercept)	9.79
class	(Intercept)	2.41
Residual		1.67

Grouping Variables
Group	# groups	ICC
school	6	0.92
class	4	0.06

summ(model5)

Observations	1200
Dependent variable	extro
Type	Mixed effects linear regression

AIC	4747.55
BIC	4788.27
Pseudo-R² (fixed effects)	0.00
Pseudo-R² (total)	0.97

Fixed Effects
	Est.	S.E.	t val.	d.f.	p
(Intercept)	55.23	4.71	11.73	9.47	0.00
open	0.14	0.05	2.54	1187.00	0.01
social	0.05	0.02	2.26	1187.01	0.02
agree	-0.00	0.01	-0.48	1187.03	0.63
open:social	-0.00	0.00	-2.37	1187.01	0.02
p values calculated using Satterthwaite d.f.

Random Effects
Group	Parameter	Std. Dev.
school	(Intercept)	9.79
class	(Intercept)	2.41
Residual		1.67

Grouping Variables
Group	# groups	ICC
school	6	0.92
class	4	0.06

summ(model6)

Observations	1200
Dependent variable	extro
Type	Mixed effects linear regression

AIC	4738.32
BIC	4773.96
Pseudo-R² (fixed effects)	0.00
Pseudo-R² (total)	0.97

Fixed Effects
	Est.	S.E.	t val.	d.f.	p
(Intercept)	55.04	4.69	11.74	9.33	0.00
open	0.14	0.05	2.56	1188.00	0.01
social	0.05	0.02	2.27	1188.01	0.02
open:social	-0.00	0.00	-2.38	1188.01	0.02
p values calculated using Satterthwaite d.f.

Random Effects
Group	Parameter	Std. Dev.
school	(Intercept)	9.79
class	(Intercept)	2.41
Residual		1.67

Grouping Variables
Group	# groups	ICC
school	6	0.92
class	4	0.06

#check residuals for homoscedasticity
plot(model6, col="darkblue",  main = 'Residuals plot',
     ylab = 'Residuals of model 1', xlab = 'Fitted Values')

# check residuals for normality
par(mfrow=c(1,2))
qqnorm(resid(model6))
qqline(resid(model6))
hist(resid(model6), main = 'Histogram of residuals', xlab = 'Residuals')

sc$open <- factor(sc$open)

#PCA

PCA.model<-princomp(~CO.mg.m3+SO2.ug.m3+TSPug.m3+
                      NAP+ACF+
                      ACE+FLU+
                      PHE+ANTH+
                      FLA+PYR, cor=TRUE, data=df7, scores=TRUE)

#the number of PCs to show can be set with ncps.
fviz_screeplot(PCA.model, addlabels = TRUE, main = 'Importance of principal components', xlab = 'Component')

cep_pc <<- within(df7, {
  PC3 <- PCA.model$scores[,3]
  PC2 <- PCA.model$scores[,2]
  PC1 <- PCA.model$scores[,1] })

get_eig(PCA.model)

eigenvalue	variance.percent	cumulative.variance.percent
4.7	42.7	42.7
2.03	18.4	61.1
1.39	12.7	73.8
0.847	7.7	81.5
0.648	5.9	87.4
0.466	4.24	91.6
0.394	3.59	95.2
0.251	2.28	97.5
0.108	0.978	98.5
0.0855	0.777	99.3
0.0813	0.739	100

p3 <- autoplot(PCA.model, data = df7, colour = 'Season', size = 3.5, 
               loadings.label.size = 5) 
 
p3 + theme_light() +  scale_size_continuous(range = c(1,3))+
  scale_colour_discrete("Season") #+

  geom_text(label=df7$Sample, vjust = -1, size=3)

## geom_text: parse = FALSE, check_overlap = FALSE, na.rm = FALSE
## stat_identity: na.rm = FALSE
## position_identity

p2 <- autoplot(PCA.model, data = df7, colour = 'Phase',size = 3.5, loadings.label.size = 4) 
p2 + theme_light() + 
  scale_colour_discrete("Phase") #+

  geom_text(label=df7$Sample, vjust = -1, size=2)

## geom_text: parse = FALSE, check_overlap = FALSE, na.rm = FALSE
## stat_identity: na.rm = FALSE
## position_identity

p1 <- autoplot(PCA.model, data = df7, colour = 'Wind.direction',  size = 3.5, loadings.label.size = 4) + 
  theme_light() + scale_colour_discrete("Wind.direction") 
p1 + theme_light() + 
  scale_colour_discrete("Wind.direction") #+

## Scale for 'colour' is already present. Adding another scale for 'colour',
## which will replace the existing scale.

  geom_text(label=df7$Sample, vjust = -1, size=2)

## geom_text: parse = FALSE, check_overlap = FALSE, na.rm = FALSE
## stat_identity: na.rm = FALSE
## position_identity

fviz_screeplot(PCA.model, addlabels = TRUE, main = 'Importance of principal components', xlab = 'Component')

fviz_contrib(PCA.model, choice = "var", axes = 1)

fviz_contrib(PCA.model, choice = "var", axes = 2)

fviz_pca_var(PCA.model, col.var="contrib",
             gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
             repel = TRUE 
)

fviz_pca_biplot(PCA.model, label="var", habillage=df7$Phase, addEllipses=TRUE, ellipse.level=0.95, loadings.label.size = 4)

fviz_pca_biplot(PCA.model, label="var", habillage=df7$Season, addEllipses=TRUE, ellipse.level=0.95, loadings.label.size = 4)

p4<-ggplot(cep_pc) +
  aes(x = SO2.ug.m3, y = PC2, colour = Season) +
  geom_point(size = 2L) +
  scale_color_hue() +
  xlab('SO2(ug/m3)')+
  theme_light() +
  theme(legend.position ="none")
p6<-ggplot(cep_pc) +
  aes(x = CO.mg.m3, y = PC2, colour = Season) +
  geom_point(size = 2L) +
  scale_color_hue() +
  xlab('CO(mg/m3)')+
  theme_light() +
  theme(legend.position ="none")
p5<-ggplot(cep_pc) +
  aes(x = ANTH, y = PC1, colour = Season) +
  geom_point(size = 2L) +
  scale_color_hue() +
  xlab('Anthracene (ANTH) ')+
  theme_light() +
  theme(legend.position ="none")
p7<-ggplot(cep_pc) +
  aes(x = PHE, y = PC1, colour = Season) +
  geom_point(size = 2L) +
  scale_color_hue() +
  xlab('Phenanthrene (PHE)')+
  theme_light() +
  theme(legend.position ="none")
grid.arrange(p5,p7,p4,p6, ncol=2)

Final Project of the course: Statistical analysis and visualization in R: II, Södertörn University

Theodorou Sofia Emmanouela

5/19/2020