#peijun

Refer to the questions here.

Q2 Body mass (BMI)

library(gamlss)
dta2 <- dbbmi

# plot
ggplot(dta2, aes(age, bmi)) +
  geom_point(alpha = .3, size = rel(.8)) +
  stat_smooth(method = "gam", formula = y ~ s(x))

# model
m0 <- lm(bmi ~ age, data = dta2)
summary(m0s <- segmented(m0, z = ~ age))

    ***Regression Model with Segmented Relationship(s)***

Call: 
segmented.lm(obj = m0, z = ~age)

Estimated Break-Point(s):
   Est. St.Err 
 8.065  0.222 

Meaningful coefficients of the linear terms:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept) 16.53511    0.05527 299.176  < 2e-16 ***
age         -0.09700    0.01942  -4.995 6.02e-07 ***
U1.age       0.64092    0.02180  29.398       NA    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 2.13 on 7290 degrees of freedom
Multiple R-Squared: 0.4636,  Adjusted R-squared: 0.4634 

Convergence attained in 5 iterations with relative change 8.169368e-06 
rhs <- function(x, k) ifelse(x > k, x - k, 0)
summary(m0p2 <- lm(bmi ~ age + rhs(age, m0s$psi[2]), data = dta2))

Call:
lm(formula = bmi ~ age + rhs(age, m0s$psi[2]), data = dta2)

Residuals:
    Min      1Q  Median      3Q     Max 
-5.8572 -1.3725 -0.2234  1.0774 15.2086 

Coefficients:
                     Estimate Std. Error t value Pr(>|t|)    
(Intercept)          16.53511    0.05071 326.046   <2e-16 ***
age                  -0.09700    0.01105  -8.779   <2e-16 ***
rhs(age, m0s$psi[2])  0.64092    0.01848  34.683   <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 2.13 on 7291 degrees of freedom
Multiple R-squared:  0.4636,    Adjusted R-squared:  0.4635 
F-statistic:  3151 on 2 and 7291 DF,  p-value: < 2.2e-16

Q3 Union, wage, gender

data("CPS1985", package = "AER")
dta3 <- CPS1985

ggplot(dta3, aes(x = union, y = wage)) +
  geom_bar(stat = "identity") +
  facet_wrap(~ gender)

# model
summary(m0 <- glm(union ~ wage + gender, data = dta3, family = "binomial"))

Call:
glm(formula = union ~ wage + gender, family = "binomial", data = dta3)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-1.2310  -0.6685  -0.5170  -0.4363   2.2196  

Coefficients:
             Estimate Std. Error z value Pr(>|z|)    
(Intercept)  -1.80629    0.26013  -6.944 3.81e-12 ***
wage          0.06023    0.02025   2.974  0.00294 ** 
genderfemale -0.74873    0.24908  -3.006  0.00265 ** 
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 503.08  on 533  degrees of freedom
Residual deviance: 481.00  on 531  degrees of freedom
AIC: 487

Number of Fisher Scoring iterations: 4
plot(effect("wage", mod = m0), ylab = "Probability", xlab = "wage")

plot(effect("gender", mod = m0), ylab = "Probability", xlab = "gender")

summary(m1 <- gam(union ~ gender + s(wage, by = gender), data = dta3, family = "binomial"))

Family: binomial 
Link function: logit 

Formula:
union ~ gender + s(wage, by = gender)

Parametric coefficients:
             Estimate Std. Error z value Pr(>|z|)    
(Intercept)   -1.4376     0.1752  -8.205  2.3e-16 ***
genderfemale  -0.5666     0.2693  -2.104   0.0354 *  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Approximate significance of smooth terms:
                       edf Ref.df Chi.sq  p-value    
s(wage):gendermale   3.431  4.262 21.059 0.000395 ***
s(wage):genderfemale 2.081  2.623  6.731 0.068691 .  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

R-sq.(adj) =  0.0849   Deviance explained = 9.73%
UBRE = -0.12143  Scale est. = 1         n = 534
anova(m0, m1)
Analysis of Deviance Table

Model 1: union ~ wage + gender
Model 2: union ~ gender + s(wage, by = gender)
  Resid. Df Resid. Dev     Df Deviance
1    531.00     481.00                
2    526.49     454.13 4.5122   26.868

Q4 Mortality rate in Sweden

dta4 <- read.table("mortality_sweden.csv", h = T, sep = ",")
dta4$logE <- log(dta4$Exposure)

# plot
ggplot(dta4, aes(x = Year, y = log(Count/Exposure), group = Gender, color = Gender)) +
      geom_point() +
      stat_smooth(method = "auto") +
      theme(legend.position = c(.1, .1))
`geom_smooth()` using method = 'loess'

# model
summary(m0 <- gam(Count ~ offset(logE) + Gender + s(Year), family = "poisson", data = dta4))

Family: poisson 
Link function: log 

Formula:
Count ~ offset(logE) + Gender + s(Year)

Parametric coefficients:
              Estimate Std. Error  z value Pr(>|z|)    
(Intercept) -4.0097738  0.0003343 -11995.9   <2e-16 ***
GenderM      0.3013409  0.0004515    667.5   <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Approximate significance of smooth terms:
          edf Ref.df  Chi.sq p-value    
s(Year) 8.991      9 5622270  <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

R-sq.(adj) =  0.755   Deviance explained = 94.1%
UBRE = 750.16  Scale est. = 1         n = 522
# by gender
summary(m1 <- gam(Count ~ offset(logE) + Gender + s(Year, by = Gender), family = "poisson", data = dta4))

Family: poisson 
Link function: log 

Formula:
Count ~ offset(logE) + Gender + s(Year, by = Gender)

Parametric coefficients:
              Estimate Std. Error  z value Pr(>|z|)    
(Intercept) -4.0064015  0.0003363 -11913.6   <2e-16 ***
GenderM      0.2846730  0.0004641    613.3   <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Approximate significance of smooth terms:
                  edf Ref.df  Chi.sq p-value    
s(Year):GenderF 8.992      9 3091133  <2e-16 ***
s(Year):GenderM 8.996      9 2559070  <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

R-sq.(adj) =  0.855   Deviance explained = 96.2%
UBRE = 479.12  Scale est. = 1         n = 522
# overdispersion
summary(m2 <- gam(Count ~ offset(logE) + Gender + s(Year, by = Gender),scale = -1, family = poisson, data = dta4))

Family: poisson 
Link function: log 

Formula:
Count ~ offset(logE) + Gender + s(Year, by = Gender)

Parametric coefficients:
             Estimate Std. Error t value Pr(>|t|)    
(Intercept) -4.006341   0.007822 -512.20   <2e-16 ***
GenderM      0.284659   0.010796   26.37   <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Approximate significance of smooth terms:
                  edf Ref.df     F p-value    
s(Year):GenderF 8.678  8.970 636.6  <2e-16 ***
s(Year):GenderM 8.807  8.989 525.9  <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

R-sq.(adj) =  0.855   Deviance explained = 96.2%
GCV = 518.54  Scale est. = 541.13    n = 522
anova(m0, m1, m2)
Analysis of Deviance Table

Model 1: Count ~ offset(logE) + Gender + s(Year)
Model 2: Count ~ offset(logE) + Gender + s(Year, by = Gender)
Model 3: Count ~ offset(logE) + Gender + s(Year, by = Gender)
  Resid. Df Resid. Dev        Df Deviance
1    511.00     392082                   
2    502.00     250582  9.000001   141501
3    502.04     250845 -0.041649     -263
# plot (m2 fit most)
dta4$yhat <- fitted(m2)
ggplot(dta4, aes(Year, log(Count/Exposure), color = Gender)) +
       geom_point() +
       geom_line(aes(Year, log(yhat/Exposure)), size = 1) +
       theme(legend.position = c(.1, .1))

Q5 Canadian prestige

library(car)
dta5 <- Prestige

# segment
dta5$Edu <- as.factor(ifelse(dta5$education <= 8.936, "low",
                            ifelse(dta5$education <= 11.473, "med", "high")))
dta5$Edu <- factor(dta5$Edu, levels = c("low", "med", "high"))  #low = 1, med = 2, high =3

#plot
ggplot(dta5, aes(x = income, y = prestige)) +
       geom_point(pch = 1) +
       stat_smooth(se = F) +
       facet_wrap(~ Edu) +
       theme_bw()
`geom_smooth()` using method = 'loess'

# four quantile
quantile(dta5$income)
      0%      25%      50%      75%     100% 
  611.00  4106.00  5930.50  8187.25 25879.00 
dta5$Inc <- as.factor(ifelse(dta5$income <= 4106, "1",
                               ifelse(dta5$income <= 5930, "2",
                                      ifelse(dta5$income <= 8187, "3", "4"))))

# plot
ggplot(dta5, aes(education, prestige))+
  geom_point(pch = 1)+
  stat_smooth(se = F)+
  facet_wrap(~Inc, nrow = 1)+
  theme_bw()
`geom_smooth()` using method = 'loess'

# model
summary(m0 <- gam(prestige ~ s(income) + s(education), data = dta5, family = "gaussian"))

Family: gaussian 
Link function: identity 

Formula:
prestige ~ s(income) + s(education)

Parametric coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)  46.8333     0.6889   67.98   <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Approximate significance of smooth terms:
               edf Ref.df     F  p-value    
s(income)    3.118  3.877 14.61 1.53e-09 ***
s(education) 3.177  3.952 38.78  < 2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

R-sq.(adj) =  0.836   Deviance explained = 84.7%
GCV = 52.143  Scale est. = 48.414    n = 102
# plot
vis.gam(m0, theta = 29, phi = 13, ticktype = "detailed", se = 2)

plot(m0, pages = 1, shift = m0$coef[1])

Q6 Kyphosis

dta6 <- kyphosis

summary(m0 <- glm(Kyphosis ~ Age + Number + Start, data = dta6, family = "binomial"))

Call:
glm(formula = Kyphosis ~ Age + Number + Start, family = "binomial", 
    data = dta6)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-2.3124  -0.5484  -0.3632  -0.1659   2.1613  

Coefficients:
             Estimate Std. Error z value Pr(>|z|)   
(Intercept) -2.036934   1.449575  -1.405  0.15996   
Age          0.010930   0.006446   1.696  0.08996 . 
Number       0.410601   0.224861   1.826  0.06785 . 
Start       -0.206510   0.067699  -3.050  0.00229 **
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 83.234  on 80  degrees of freedom
Residual deviance: 61.380  on 77  degrees of freedom
AIC: 69.38

Number of Fisher Scoring iterations: 5
plot(effect("Number", mod = m0), ylab="Probability", xlab="Number", main="")

plot(effect("Age", mod = m0), ylab="Probability", xlab="Age", main="")

plot(effect("Start", mod = m0), ylab="Probability", xlab="Start", main="")

summary(m2 <- gam(Kyphosis ~ s(Age) + Number + s(Start), data = dta6, family = "binomial"))

Family: binomial 
Link function: logit 

Formula:
Kyphosis ~ s(Age) + Number + s(Start)

Parametric coefficients:
            Estimate Std. Error z value Pr(>|z|)   
(Intercept)  -3.5926     1.1464  -3.134  0.00172 **
Number        0.3333     0.2324   1.434  0.15148   
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Approximate significance of smooth terms:
           edf Ref.df Chi.sq p-value  
s(Age)   2.209  2.790  6.305  0.0795 .
s(Start) 2.020  2.523  9.645  0.0146 *
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

R-sq.(adj) =  0.355   Deviance explained = 39.4%
UBRE = -0.22384  Scale est. = 1         n = 81
anova(m2)

Family: binomial 
Link function: logit 

Formula:
Kyphosis ~ s(Age) + Number + s(Start)

Parametric Terms:
       df Chi.sq p-value
Number  1  2.057   0.151

Approximate significance of smooth terms:
           edf Ref.df Chi.sq p-value
s(Age)   2.209  2.790  6.305  0.0795
s(Start) 2.020  2.523  9.645  0.0146
vis.gam(m2, theta = 35, phi = 15, tick="detailed", xlab = "Start", ylab = "Age")

vis.gam(m2, theta = -35, phi = 15, tick="detailed", xlab = "Start", ylab = "Age")