Semiparametric mixed models

Q2 BMI

##Fit an appropriate model to describe the relationship between age 
 #and bmi in the body mass index data
require(gamlss)

## Loading required package: gamlss

## Warning: package 'gamlss' was built under R version 3.4.3

## Loading required package: splines

## Loading required package: gamlss.data

## Loading required package: gamlss.dist

## Warning: package 'gamlss.dist' was built under R version 3.4.3

## Loading required package: MASS

## Loading required package: nlme

## Loading required package: parallel

##  **********   GAMLSS Version 5.0-5  **********

## For more on GAMLSS look at http://www.gamlss.org/

## Type gamlssNews() to see new features/changes/bug fixes.

data("dbbmi", package = "gamlss")

## Warning in data("dbbmi", package = "gamlss"): data set 'dbbmi' not found

str(dbbmi)

## 'data.frame':    7294 obs. of  2 variables:
##  $ age: num  0.03 0.04 0.04 0.04 0.04 0.05 0.05 0.05 0.05 0.05 ...
##  $ bmi: num  13.2 12.4 14.5 11.8 15.3 ...

head(dbbmi)

##    age      bmi
## 1 0.03 13.23529
## 2 0.04 12.43877
## 3 0.04 14.54177
## 4 0.04 11.77395
## 5 0.04 15.32561
## 6 0.05 13.21439

require(ggplot2)

## Loading required package: ggplot2

plot(dbbmi, pch=20,cex=.1)

# simple regression model
m0 <- lm(bmi ~ age, data = dbbmi)

# piecewise regression to choose knots
# psi設定要找的區間
require(segmented)

## Loading required package: segmented

## Warning: package 'segmented' was built under R version 3.4.3

segmented(m0, seg.Z = ~ age, psi = c(3, 20), data =dbbmi)

## Call: segmented.lm(obj = m0, seg.Z = ~age, psi = c(3, 20), data = dbbmi)
## 
## Meaningful coefficients of the linear terms:
## (Intercept)          age       U1.age       U2.age  
##     13.9078       8.9143      -9.3317       0.9494  
## 
## Estimated Break-Point(s):
## psi1.age  psi2.age  
##   0.3883    6.2926

# set up a linear spline at 8
a8 <- (dbbmi$age - 8)
a8[a8 < 0] <- 0

# set up a linear spline at 18
a18 <- (dbbmi$age - 18)
a18[a18 < 0] <- 0

# the linear spline basis functions
# fit the linear spline
summary(m1 <- lm(bmi ~ age + a8 + a18, data = dbbmi ))

## 
## Call:
## lm(formula = bmi ~ age + a8 + a18, data = dbbmi)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5.9781 -1.3752 -0.2268  1.0825 15.1185 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 16.56039    0.05103 324.517  < 2e-16 ***
## age         -0.11548    0.01178  -9.803  < 2e-16 ***
## a8           0.68208    0.02121  32.157  < 2e-16 ***
## a18         -0.24916    0.06764  -3.683 0.000232 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.128 on 7290 degrees of freedom
## Multiple R-squared:  0.4646, Adjusted R-squared:  0.4644 
## F-statistic:  2109 on 3 and 7290 DF,  p-value: < 2.2e-16

# fit the natural splines model at two knots
require(splines)
summary(m2 <- lm(bmi ~ ns(age, knots = c(8, 18)), data = dbbmi))

## 
## Call:
## lm(formula = bmi ~ ns(age, knots = c(8, 18)), data = dbbmi)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -5.964 -1.384 -0.219  1.076 15.131 
## 
## Coefficients:
##                            Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                16.65712    0.05403  308.32   <2e-16 ***
## ns(age, knots = c(8, 18))1  4.30095    0.13069   32.91   <2e-16 ***
## ns(age, knots = c(8, 18))2  4.07593    0.22312   18.27   <2e-16 ***
## ns(age, knots = c(8, 18))3  7.21840    0.14023   51.48   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.129 on 7290 degrees of freedom
## Multiple R-squared:  0.4638, Adjusted R-squared:  0.4636 
## F-statistic:  2102 on 3 and 7290 DF,  p-value: < 2.2e-16

summary(m3 <- lm(bmi ~ ns(age, df = 3), data =dbbmi))

## 
## Call:
## lm(formula = bmi ~ ns(age, df = 3), data = dbbmi)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5.9419 -1.3960 -0.2083  1.0917 15.3131 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      16.69942    0.06135  272.18   <2e-16 ***
## ns(age, df = 3)1  1.64189    0.12810   12.82   <2e-16 ***
## ns(age, df = 3)2  3.75807    0.20586   18.25   <2e-16 ***
## ns(age, df = 3)3  7.61545    0.11104   68.58   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.139 on 7290 degrees of freedom
## Multiple R-squared:  0.4592, Adjusted R-squared:  0.4589 
## F-statistic:  2063 on 3 and 7290 DF,  p-value: < 2.2e-16

# see where the knots are
attr(terms(m3), "predvars")

## list(bmi, ns(age, knots = c(3.3, 13.65), Boundary.knots = c(0.03, 
## 21.7), intercept = FALSE))

# plot data and mode fits
plot(bmi ~ age, data = dbbmi, col = "gray20", cex = .1,
     xlab = "Age", ylab = "bmi")
abline(v = c(8, 18), lty = 3)    # lty= line type, 3=dotted
grid()
lines(dbbmi$age, fitted(m1), col="darkred", lty = 2, lwd=1.5 )
lines(dbbmi$age, fitted(m2), col = "steelblue")
lines(dbbmi$age, fitted(m3), col = "skyblue")

# compare model fits
anova(m0, m1, m2, m3)

## Analysis of Variance Table
## 
## Model 1: bmi ~ age
## Model 2: bmi ~ age + a8 + a18
## Model 3: bmi ~ ns(age, knots = c(8, 18))
## Model 4: bmi ~ ns(age, df = 3)
##   Res.Df   RSS Df Sum of Sq      F    Pr(>F)    
## 1   7292 38522                                  
## 2   7290 33005  2    5516.4 609.22 < 2.2e-16 ***
## 3   7290 33057  0     -51.4                     
## 4   7290 33342  0    -285.1                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

#m1 is the best

Q3 union membership, wage and gender

#relationship between union membership and wage and gender
#generalized additive model

require(AER)

## Loading required package: AER

## Warning: package 'AER' was built under R version 3.4.3

## Loading required package: car

## Loading required package: lmtest

## Loading required package: zoo

## 
## Attaching package: 'zoo'

## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric

## Loading required package: sandwich

## Loading required package: survival

data("CPS1985", package = "AER")
dat<-CPS1985
str(dat)

## 'data.frame':    534 obs. of  11 variables:
##  $ wage      : num  5.1 4.95 6.67 4 7.5 ...
##  $ education : num  8 9 12 12 12 13 10 12 16 12 ...
##  $ experience: num  21 42 1 4 17 9 27 9 11 9 ...
##  $ age       : num  35 57 19 22 35 28 43 27 33 27 ...
##  $ ethnicity : Factor w/ 3 levels "cauc","hispanic",..: 2 1 1 1 1 1 1 1 1 1 ...
##  $ region    : Factor w/ 2 levels "south","other": 2 2 2 2 2 2 1 2 2 2 ...
##  $ gender    : Factor w/ 2 levels "male","female": 2 2 1 1 1 1 1 1 1 1 ...
##  $ occupation: Factor w/ 6 levels "worker","technical",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ sector    : Factor w/ 3 levels "manufacturing",..: 1 1 1 3 3 3 3 3 1 3 ...
##  $ union     : Factor w/ 2 levels "no","yes": 1 1 1 1 1 2 1 1 1 1 ...
##  $ married   : Factor w/ 2 levels "no","yes": 2 2 1 1 2 1 1 1 2 1 ...

require(ggplot2)
#看一下男女有沒有加入公會與薪資的關係
ggplot(dat, aes(x = union, y = wage, color = gender))+
  stat_summary(fun.data = mean_se)+
  theme_bw()+
  labs(x = "union", y = "wage")

summary(m0 <- glm(union ~ gender + wage, data = dat, family = "binomial"))

## 
## Call:
## glm(formula = union ~ gender + wage, family = "binomial", data = dat)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.2310  -0.6685  -0.5170  -0.4363   2.2196  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -1.80629    0.26013  -6.944 3.81e-12 ***
## genderfemale -0.74873    0.24908  -3.006  0.00265 ** 
## wage          0.06023    0.02025   2.974  0.00294 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 503.08  on 533  degrees of freedom
## Residual deviance: 481.00  on 531  degrees of freedom
## AIC: 487
## 
## Number of Fisher Scoring iterations: 4

require(effects)

## Loading required package: effects

## Warning: package 'effects' was built under R version 3.4.3

## Loading required package: carData

## 
## Attaching package: 'carData'

## The following objects are masked from 'package:car':
## 
##     Guyer, UN, Vocab

## lattice theme set by effectsTheme()
## See ?effectsTheme for details.

# effects plot
plot(effect("gender", mod = m0), ylab="Probability", xlab="gender", main="")

plot(effect("wage", mod = m0), ylab="Probability", xlab="wage", main="")

require(mgcv)

## Loading required package: mgcv

## This is mgcv 1.8-22. For overview type 'help("mgcv-package")'.

summary(m1 <- gam(union ~ gender+ s(wage, by=gender), data = dat, family = "binomial"))

## 
## Family: binomial 
## Link function: logit 
## 
## Formula:
## union ~ gender + s(wage, by = gender)
## 
## Parametric coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   -1.4376     0.1752  -8.205  2.3e-16 ***
## genderfemale  -0.5666     0.2693  -2.104   0.0354 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                        edf Ref.df Chi.sq  p-value    
## s(wage):gendermale   3.431  4.262 21.059 0.000395 ***
## s(wage):genderfemale 2.081  2.623  6.731 0.068691 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.0849   Deviance explained = 9.73%
## UBRE = -0.12143  Scale est. = 1         n = 534

anova(m0, m1, test = "Chisq")

## Analysis of Deviance Table
## 
## Model 1: union ~ gender + wage
## Model 2: union ~ gender + s(wage, by = gender)
##   Resid. Df Resid. Dev     Df Deviance  Pr(>Chi)    
## 1    531.00     481.00                              
## 2    526.49     454.13 4.5122   26.868 3.685e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

#m1 is better

Q4mortality rates in Sweden

#analysis in mortality rates in Sweden
#Mortality rate (count/exposure) follows a Poisson distribution(λ).
#log(count/exposure) = log(count) - log(exposure).
dat = read.csv("http://140.116.183.121/~sheu/lmm/Data/mortality_sweden.csv",sep=",", h = T)
str(dat)

## 'data.frame':    522 obs. of  4 variables:
##  $ Year    : int  1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 ...
##  $ Count   : int  20406 15944 15669 18365 16631 19237 21400 26392 21451 21873 ...
##  $ Exposure: num  562888 561540 553564 542766 536570 ...
##  $ Gender  : Factor w/ 2 levels "F","M": 2 2 2 2 2 2 2 2 2 2 ...

require(lattice)

## Loading required package: lattice

require(mgcv)

xyplot(log(Count/Exposure) ~ Year, groups=Gender, data = dat,
       col = c("tomato", "steelblue"), type = c("p","g","smooth") )

# augument the exposure term in log
dat$le <- log(dat$Exposure)

# log(E[Yti]) = log(eti) + β0 + β1 genderi + s(yeart)
# s= definition of smooth terms within gam model formulae
summary(m0 <- gam(Count ~ offset(le) + Gender + s(Year), family = poisson, data = dat))

## 
## Family: poisson 
## Link function: log 
## 
## Formula:
## Count ~ offset(le) + Gender + s(Year)
## 
## Parametric coefficients:
##               Estimate Std. Error  z value Pr(>|z|)    
## (Intercept) -4.0097738  0.0003343 -11995.9   <2e-16 ***
## GenderM      0.3013409  0.0004515    667.5   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##           edf Ref.df  Chi.sq p-value    
## s(Year) 8.991      9 5622270  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.755   Deviance explained = 94.1%
## UBRE = 750.16  Scale est. = 1         n = 522

#log(E[Yti]) = log(eti) + β0 + β1 genderi + si(yeart)
# different smoothing curves by gender
summary(m1 <- gam(Count ~ offset(le) + Gender + s(Year, by = Gender), family = poisson, data = dat))

## 
## Family: poisson 
## Link function: log 
## 
## Formula:
## Count ~ offset(le) + Gender + s(Year, by = Gender)
## 
## Parametric coefficients:
##               Estimate Std. Error  z value Pr(>|z|)    
## (Intercept) -4.0064015  0.0003363 -11913.6   <2e-16 ***
## GenderM      0.2846730  0.0004641    613.3   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                   edf Ref.df  Chi.sq p-value    
## s(Year):GenderF 8.992      9 3091133  <2e-16 ***
## s(Year):GenderM 8.996      9 2559070  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.855   Deviance explained = 96.2%
## UBRE = 479.12  Scale est. = 1         n = 522

# overdispersion Poisson
summary(m2 <- gam(Count ~ offset(le) + Gender + s(Year, by = Gender),scale = -1,
          family = poisson, data = dat))

## 
## Family: poisson 
## Link function: log 
## 
## Formula:
## Count ~ offset(le) + Gender + s(Year, by = Gender)
## 
## Parametric coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -4.006341   0.007822 -512.20   <2e-16 ***
## GenderM      0.284659   0.010796   26.37   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                   edf Ref.df     F p-value    
## s(Year):GenderF 8.678  8.970 636.6  <2e-16 ***
## s(Year):GenderM 8.807  8.989 525.9  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.855   Deviance explained = 96.2%
## GCV = 518.54  Scale est. = 541.13    n = 522

anova(m0,m1,m2,test = "Chisq")

## Analysis of Deviance Table
## 
## Model 1: Count ~ offset(le) + Gender + s(Year)
## Model 2: Count ~ offset(le) + Gender + s(Year, by = Gender)
## Model 3: Count ~ offset(le) + Gender + s(Year, by = Gender)
##   Resid. Df Resid. Dev        Df Deviance  Pr(>Chi)    
## 1    511.00     392082                                 
## 2    502.00     250582  9.000001   141501 < 2.2e-16 ***
## 3    502.04     250845 -0.041649     -263 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

#m1 is the best

#fitted values
dat$yhat <- fitted(m1)

require(ggplot2)
# m1預測值(線)與觀察值
ggplot(dat, aes(Year, Count/Exposure, color = Gender)) +
  geom_point(pch = 1) +
  geom_line(aes(Year, yhat/Exposure)) +
  guides(colour = guide_legend(reverse = T))+  #order of legends is reversed
  theme_bw() +
  theme()

Q5 Prestige Data

require(car)
data(Prestige)
dat<-Prestige
str(dat)

## 'data.frame':    102 obs. of  6 variables:
##  $ education: num  13.1 12.3 12.8 11.4 14.6 ...
##  $ income   : int  12351 25879 9271 8865 8403 11030 8258 14163 11377 11023 ...
##  $ women    : num  11.16 4.02 15.7 9.11 11.68 ...
##  $ prestige : num  68.8 69.1 63.4 56.8 73.5 77.6 72.6 78.1 73.1 68.8 ...
##  $ census   : int  1113 1130 1171 1175 2111 2113 2133 2141 2143 2153 ...
##  $ type     : Factor w/ 3 levels "bc","prof","wc": 2 2 2 2 2 2 2 2 2 2 ...

#依老師的圖示的label
dat$es[dat$education >=6.37 & dat$education <= 9.58]<- 1
dat$es[dat$education >9.58 & dat$education <= 12.8]<- 2
dat$es[dat$education >12.8 & dat$education <= 16]<- 3
dat$es<-as.factor(dat$es)
dat$es<-factor(dat$es, levels=c(1,2,3), labels=c("6.37-9.58", "9.58-12.8", "12.8-16"))
#
require(ggplot2)
ggplot(dat, aes(income, prestige))+
  geom_point(pch=1, colour="blue")+
  stat_smooth(se = F)+
  facet_wrap(~es)+
  theme_bw()

## `geom_smooth()` using method = 'loess'

#以收入的四分位數作為觀察類別
quantile(dat$income)

##       0%      25%      50%      75%     100% 
##   611.00  4106.00  5930.50  8187.25 25879.00

dat$ei[dat$income <=4106]<- 1
dat$ei[dat$income >4106 & dat$income <= 5930.5]<- 2
dat$ei[dat$income >5930.5 & dat$income <= 8187.25]<- 3
dat$ei[dat$income >8187.25 & dat$income <= 25879]<- 4
dat$ei<-factor(dat$ei, levels=c(1,2,3,4))
#
ggplot(dat, aes(education, prestige))+
  geom_point(pch=1, colour="blue")+
  stat_smooth(se = F)+
  facet_wrap(~ei,ncol = 4)+
  theme_bw()

## `geom_smooth()` using method = 'loess'

summary(m0 <- gam(prestige ~ s(income) + s(education), data = dat, family = gaussian))

## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## prestige ~ s(income) + s(education)
## 
## Parametric coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  46.8333     0.6889   67.98   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                edf Ref.df     F  p-value    
## s(income)    3.118  3.877 14.61 1.53e-09 ***
## s(education) 3.177  3.952 38.78  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.836   Deviance explained = 84.7%
## GCV = 52.143  Scale est. = 48.414    n = 102

plot(m0, pages = 1, shift = m0$coef[1])

vis.gam(m0, theta = 29, phi = 13, ticktype = "detailed", 
        se = 2)

Q6 kyphosis data

#kyphosis{rpart} data set 
require(rpart)

## Loading required package: rpart

data(kyphosis)
dat<-kyphosis
str(dat)

## 'data.frame':    81 obs. of  4 variables:
##  $ Kyphosis: Factor w/ 2 levels "absent","present": 1 1 2 1 1 1 1 1 1 2 ...
##  $ Age     : int  71 158 128 2 1 1 61 37 113 59 ...
##  $ Number  : int  3 3 4 5 4 2 2 3 2 6 ...
##  $ Start   : int  5 14 5 1 15 16 17 16 16 12 ...

summary(m0 <- glm(Kyphosis ~ Age + Number + Start,  data = dat, family = "binomial"))

## 
## Call:
## glm(formula = Kyphosis ~ Age + Number + Start, family = "binomial", 
##     data = dat)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.3124  -0.5484  -0.3632  -0.1659   2.1613  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)   
## (Intercept) -2.036934   1.449575  -1.405  0.15996   
## Age          0.010930   0.006446   1.696  0.08996 . 
## Number       0.410601   0.224861   1.826  0.06785 . 
## Start       -0.206510   0.067699  -3.050  0.00229 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 83.234  on 80  degrees of freedom
## Residual deviance: 61.380  on 77  degrees of freedom
## AIC: 69.38
## 
## Number of Fisher Scoring iterations: 5

# effects plot
plot(effect("Age", mod = m0), ylab="Probability", xlab="Age", main="")

plot(effect("Number", mod = m0), ylab="Probability", xlab="Number", main="")

plot(effect("Start", mod = m0), ylab="Probability", xlab="Start", main="")

summary(m1 <- gam(Kyphosis ~ s(Age) + s(Start)+ Number , data = dat, family = "binomial"))

## 
## Family: binomial 
## Link function: logit 
## 
## Formula:
## Kyphosis ~ s(Age) + s(Start) + Number
## 
## Parametric coefficients:
##             Estimate Std. Error z value Pr(>|z|)   
## (Intercept)  -3.5926     1.1464  -3.134  0.00172 **
## Number        0.3333     0.2324   1.434  0.15148   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##            edf Ref.df Chi.sq p-value  
## s(Age)   2.209  2.790  6.305  0.0795 .
## s(Start) 2.020  2.523  9.645  0.0146 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.355   Deviance explained = 39.4%
## UBRE = -0.22384  Scale est. = 1         n = 81

summary(m2 <- gam(Kyphosis ~ s(Age) + s(Start) , data = dat, family = "binomial"))

## 
## Family: binomial 
## Link function: logit 
## 
## Formula:
## Kyphosis ~ s(Age) + s(Start)
## 
## Parametric coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -2.1723     0.4846  -4.483 7.37e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##            edf Ref.df Chi.sq p-value   
## s(Age)   2.183  2.761  6.346 0.08054 . 
## s(Start) 2.199  2.740 12.798 0.00421 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =   0.33   Deviance explained = 37.1%
## UBRE = -0.2204  Scale est. = 1         n = 81

anova(m1)

## 
## Family: binomial 
## Link function: logit 
## 
## Formula:
## Kyphosis ~ s(Age) + s(Start) + Number
## 
## Parametric Terms:
##        df Chi.sq p-value
## Number  1  2.057   0.151
## 
## Approximate significance of smooth terms:
##            edf Ref.df Chi.sq p-value
## s(Age)   2.209  2.790  6.305  0.0795
## s(Start) 2.020  2.523  9.645  0.0146

#其實age不用配曲線(?)

anova(m0,m1, m2)

## Analysis of Deviance Table
## 
## Model 1: Kyphosis ~ Age + Number + Start
## Model 2: Kyphosis ~ s(Age) + s(Start) + Number
## Model 3: Kyphosis ~ s(Age) + s(Start)
##   Resid. Df Resid. Dev       Df Deviance
## 1    77.000     61.380                  
## 2    74.770     50.410  2.22951  10.9701
## 3    75.618     52.383 -0.84749  -1.9735

AIC(m0,m1, m2)

##          df      AIC
## m0 4.000000 69.37993
## m1 6.229507 62.86889
## m2 5.382020 63.14741

#m1 is better

vis.gam(m1, theta = 35, phi = 15, tick="detailed" )

vis.gam(m1, theta = -35, phi = 15, tick="detailed" )

Q7 ERP data

##Include subjects as random effects in the analysis of the ERP data example
require(erp.easy)

## Loading required package: erp.easy

## Warning: package 'erp.easy' was built under R version 3.4.3

data(ERPdata)
dat<-ERPdata
?ERPdata

## starting httpd help server ...

##  done

# extract only V6 data
dat <- ERPdata[, 1:4]
str(dat)

## 'data.frame':    13530 obs. of  4 variables:
##  $ Subject : Factor w/ 15 levels "S1","S10","S11",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ Stimulus: Factor w/ 2 levels "Negative","Neutral": 1 1 1 1 1 1 1 1 1 1 ...
##  $ Time    : int  -100 -96 -92 -88 -84 -80 -76 -72 -68 -64 ...
##  $ V6      : num  -2.025 -2.928 -0.975 -2.228 -3.478 ...

ggplot(dat, aes(Time, V6, group = Subject)) +
  stat_smooth(se = F, lwd = .1, alpha = .2) +
  facet_wrap( ~ Stimulus) +
  labs(x = "Time", y = "Amplitude") +
  theme_bw()

## `geom_smooth()` using method = 'loess'

# m1
summary(m1 <- gam(V6 ~ Stimulus+ s(Time, by = Stimulus), data = dat))

## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## V6 ~ Stimulus + s(Time, by = Stimulus)
## 
## Parametric coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     -2.84985    0.07672  -37.15   <2e-16 ***
## StimulusNeutral  1.84999    0.10850   17.05   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                            edf Ref.df     F p-value    
## s(Time):StimulusNegative 7.922  8.696 618.6  <2e-16 ***
## s(Time):StimulusNeutral  8.586  8.951 540.8  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.437   Deviance explained = 43.8%
## GCV = 39.876  Scale est. = 39.821    n = 13530

plot(m1, pages = 1, shift = m1$coef[1])

vis.gam(m1, theta = 29, phi = 13, ticktype = "detailed", se = 2)

# m2 - Include subjects as random effects
summary(m2 <- gam(V6 ~ Stimulus+ s(Time, by = Stimulus)+ s(Subject, bs = "re"), data = dat))

## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## V6 ~ Stimulus + s(Time, by = Stimulus) + s(Subject, bs = "re")
## 
## Parametric coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     -2.84985    0.70691  -4.031 5.58e-05 ***
## StimulusNeutral  1.84999    0.09858  18.767  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                             edf Ref.df     F p-value    
## s(Time):StimulusNegative  8.041  8.755 744.7  <2e-16 ***
## s(Time):StimulusNeutral   8.651  8.965 654.4  <2e-16 ***
## s(Subject)               13.932 14.000 204.1  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.536   Deviance explained = 53.7%
## GCV = 32.948  Scale est. = 32.869    n = 13530

plot(m2, pages = 1, shift = m2$coef[1])

vis.gam(m2, theta = 29, phi = 13, ticktype = "detailed", se = 2)

Q8 skin conductance data

##gneralized additive mixed model
 #skin conductance data
pacman::p_load(tidyverse, mgcv, gamm4, ElemStatLearn, itsadug)
dat<-read.csv("http://140.116.183.121/~sheu/lmm/Data/skinPotentials.csv", sep=",",header = T)
str(dat)

## 'data.frame':    143 obs. of  3 variables:
##  $ ID       : Factor w/ 13 levels "S101","S102",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ Time     : int  0 50 100 150 200 250 300 350 400 450 ...
##  $ Amplitude: num  9.5 10 11 11 11 10.5 10 9.5 9.5 9.5 ...

ggplot(dat, aes(Time, Amplitude))+
  geom_point()+
  facet_wrap(~ID)+
  theme_bw()

ggplot(dat, aes(Time, Amplitude))+
  geom_point(color = "gray")+
  geom_smooth(color = "gray")+
  stat_summary(fun.y = mean, geom = "point")+
  stat_summary(fun.data = mean_se, geom = "errorbar",width = 0)+
  theme_bw()

## `geom_smooth()` using method = 'loess'

# Analysis of repeated interaction experiments (ID) using GAMs
## Fit the model with a non-linear effect over time:
summary(m1 <- bam(Amplitude ~ s(Time) +             # fixed effects
                    s(Time, ID, bs = 'fs', m = 1),  # random effects
                    # fs= smooth for each one in a study, and each smooth should have the same smoothing parameter        
                    data = dat))

## Warning in gam.side(sm, X, tol = .Machine$double.eps^0.5): model has
## repeated 1-d smooths of same variable.

## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## Amplitude ~ s(Time) + s(Time, ID, bs = "fs", m = 1)
## 
## Parametric coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   8.9715     0.4795   18.71   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##               edf  Ref.df     F p-value    
## s(Time)     7.314   7.924 23.79  <2e-16 ***
## s(Time,ID) 80.472 116.000 20.86  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.962   Deviance explained = 98.6%
## fREML = 217.74  Scale est. = 0.22943   n = 143

#
par(mfrow=c(1,2))

## Plot effect of t(ime)
plot_smooth(m1, 'Time', rm.ranef = T, main = 'Effect of time')

## Summary:
##  * Time : numeric predictor; with 30 values ranging from 0.000000 to 500.000000. 
##  * ID : factor; set to the value(s): S101. (Might be canceled as random effect, check below.) 
##  * NOTE : The following random effects columns are canceled: s(Time,ID)
##

## Plot individual variation:
plot(m1, select = 2)

anova(m1)

## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## Amplitude ~ s(Time) + s(Time, ID, bs = "fs", m = 1)
## 
## Approximate significance of smooth terms:
##                edf  Ref.df     F p-value
## s(Time)      7.314   7.924 23.79  <2e-16
## s(Time,ID)  80.472 116.000 20.86  <2e-16

#試試看
## Define starting point for each time series:
dat2 <- start_event(dat, column = 'Time', event = 'ID')

# The amount of autocorrelation is set to the height of the second bar:
rhoval <- acf_resid(m1)[2]

## And we refit the model:
summary(m2 <- bam(Amplitude ~ s(Time) +                       # fixed effects
                    s(Time, ID, bs = 'fs', m = 1),            # random effects
                  data = dat2,
                  rho = rhoval, AR.start = start.event        # autocorrelation parameters
))

## Warning in gam.side(sm, X, tol = .Machine$double.eps^0.5): model has
## repeated 1-d smooths of same variable.

## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## Amplitude ~ s(Time) + s(Time, ID, bs = "fs", m = 1)
## 
## Parametric coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    8.967      0.484   18.53   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##              edf  Ref.df      F p-value    
## s(Time)     7.39   7.819  22.89  <2e-16 ***
## s(Time,ID) 95.69 116.000 116.41  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.967   Deviance explained = 99.1%
## fREML = 218.07  Scale est. = 0.10873   n = 143

#
acf_resid(m2)

#其實m1m2在此case中差不多?

Semiparametric mixed models

Li

2017年12月11日

Q2 BMI

Q3 union membership, wage and gender

Q4mortality rates in Sweden

Q5 Prestige Data

Q6 kyphosis data

Q7 ERP data

Q8 skin conductance data