#peijun
Refer to the questions here.
library(gamlss)
dta2 <- dbbmi
# plot
ggplot(dta2, aes(age, bmi)) +
geom_point(alpha = .3, size = rel(.8)) +
stat_smooth(method = "gam", formula = y ~ s(x))
# model
m0 <- lm(bmi ~ age, data = dta2)
summary(m0s <- segmented(m0, z = ~ age))
***Regression Model with Segmented Relationship(s)***
Call:
segmented.lm(obj = m0, z = ~age)
Estimated Break-Point(s):
Est. St.Err
8.065 0.222
Meaningful coefficients of the linear terms:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 16.53511 0.05527 299.176 < 2e-16 ***
age -0.09700 0.01942 -4.995 6.02e-07 ***
U1.age 0.64092 0.02180 29.398 NA
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 2.13 on 7290 degrees of freedom
Multiple R-Squared: 0.4636, Adjusted R-squared: 0.4634
Convergence attained in 5 iterations with relative change 8.169368e-06
rhs <- function(x, k) ifelse(x > k, x - k, 0)
summary(m0p2 <- lm(bmi ~ age + rhs(age, m0s$psi[2]), data = dta2))
Call:
lm(formula = bmi ~ age + rhs(age, m0s$psi[2]), data = dta2)
Residuals:
Min 1Q Median 3Q Max
-5.8572 -1.3725 -0.2234 1.0774 15.2086
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 16.53511 0.05071 326.046 <2e-16 ***
age -0.09700 0.01105 -8.779 <2e-16 ***
rhs(age, m0s$psi[2]) 0.64092 0.01848 34.683 <2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 2.13 on 7291 degrees of freedom
Multiple R-squared: 0.4636, Adjusted R-squared: 0.4635
F-statistic: 3151 on 2 and 7291 DF, p-value: < 2.2e-16
data("CPS1985", package = "AER")
dta3 <- CPS1985
ggplot(dta3, aes(x = union, y = wage)) +
geom_bar(stat = "identity") +
facet_wrap(~ gender)
# model
summary(m0 <- glm(union ~ wage + gender, data = dta3, family = "binomial"))
Call:
glm(formula = union ~ wage + gender, family = "binomial", data = dta3)
Deviance Residuals:
Min 1Q Median 3Q Max
-1.2310 -0.6685 -0.5170 -0.4363 2.2196
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -1.80629 0.26013 -6.944 3.81e-12 ***
wage 0.06023 0.02025 2.974 0.00294 **
genderfemale -0.74873 0.24908 -3.006 0.00265 **
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 503.08 on 533 degrees of freedom
Residual deviance: 481.00 on 531 degrees of freedom
AIC: 487
Number of Fisher Scoring iterations: 4
plot(effect("wage", mod = m0), ylab = "Probability", xlab = "wage")
plot(effect("gender", mod = m0), ylab = "Probability", xlab = "gender")
summary(m1 <- gam(union ~ gender + s(wage, by = gender), data = dta3, family = "binomial"))
Family: binomial
Link function: logit
Formula:
union ~ gender + s(wage, by = gender)
Parametric coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -1.4376 0.1752 -8.205 2.3e-16 ***
genderfemale -0.5666 0.2693 -2.104 0.0354 *
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Approximate significance of smooth terms:
edf Ref.df Chi.sq p-value
s(wage):gendermale 3.431 4.262 21.059 0.000395 ***
s(wage):genderfemale 2.081 2.623 6.731 0.068691 .
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
R-sq.(adj) = 0.0849 Deviance explained = 9.73%
UBRE = -0.12143 Scale est. = 1 n = 534
anova(m0, m1)
Analysis of Deviance Table
Model 1: union ~ wage + gender
Model 2: union ~ gender + s(wage, by = gender)
Resid. Df Resid. Dev Df Deviance
1 531.00 481.00
2 526.49 454.13 4.5122 26.868
dta4 <- read.table("mortality_sweden.csv", h = T, sep = ",")
dta4$logE <- log(dta4$Exposure)
# plot
ggplot(dta4, aes(x = Year, y = log(Count/Exposure), group = Gender, color = Gender)) +
geom_point() +
stat_smooth(method = "auto") +
theme(legend.position = c(.1, .1))
`geom_smooth()` using method = 'loess'
# model
summary(m0 <- gam(Count ~ offset(logE) + Gender + s(Year), family = "poisson", data = dta4))
Family: poisson
Link function: log
Formula:
Count ~ offset(logE) + Gender + s(Year)
Parametric coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -4.0097738 0.0003343 -11995.9 <2e-16 ***
GenderM 0.3013409 0.0004515 667.5 <2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Approximate significance of smooth terms:
edf Ref.df Chi.sq p-value
s(Year) 8.991 9 5622270 <2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
R-sq.(adj) = 0.755 Deviance explained = 94.1%
UBRE = 750.16 Scale est. = 1 n = 522
# by gender
summary(m1 <- gam(Count ~ offset(logE) + Gender + s(Year, by = Gender), family = "poisson", data = dta4))
Family: poisson
Link function: log
Formula:
Count ~ offset(logE) + Gender + s(Year, by = Gender)
Parametric coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -4.0064015 0.0003363 -11913.6 <2e-16 ***
GenderM 0.2846730 0.0004641 613.3 <2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Approximate significance of smooth terms:
edf Ref.df Chi.sq p-value
s(Year):GenderF 8.992 9 3091133 <2e-16 ***
s(Year):GenderM 8.996 9 2559070 <2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
R-sq.(adj) = 0.855 Deviance explained = 96.2%
UBRE = 479.12 Scale est. = 1 n = 522
# overdispersion
summary(m2 <- gam(Count ~ offset(logE) + Gender + s(Year, by = Gender),scale = -1, family = poisson, data = dta4))
Family: poisson
Link function: log
Formula:
Count ~ offset(logE) + Gender + s(Year, by = Gender)
Parametric coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -4.006341 0.007822 -512.20 <2e-16 ***
GenderM 0.284659 0.010796 26.37 <2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Approximate significance of smooth terms:
edf Ref.df F p-value
s(Year):GenderF 8.678 8.970 636.6 <2e-16 ***
s(Year):GenderM 8.807 8.989 525.9 <2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
R-sq.(adj) = 0.855 Deviance explained = 96.2%
GCV = 518.54 Scale est. = 541.13 n = 522
anova(m0, m1, m2)
Analysis of Deviance Table
Model 1: Count ~ offset(logE) + Gender + s(Year)
Model 2: Count ~ offset(logE) + Gender + s(Year, by = Gender)
Model 3: Count ~ offset(logE) + Gender + s(Year, by = Gender)
Resid. Df Resid. Dev Df Deviance
1 511.00 392082
2 502.00 250582 9.000001 141501
3 502.04 250845 -0.041649 -263
# plot (m2 fit most)
dta4$yhat <- fitted(m2)
ggplot(dta4, aes(Year, log(Count/Exposure), color = Gender)) +
geom_point() +
geom_line(aes(Year, log(yhat/Exposure)), size = 1) +
theme(legend.position = c(.1, .1))
library(car)
dta5 <- Prestige
# segment
dta5$Edu <- as.factor(ifelse(dta5$education <= 8.936, "low",
ifelse(dta5$education <= 11.473, "med", "high")))
dta5$Edu <- factor(dta5$Edu, levels = c("low", "med", "high")) #low = 1, med = 2, high =3
#plot
ggplot(dta5, aes(x = income, y = prestige)) +
geom_point(pch = 1) +
stat_smooth(se = F) +
facet_wrap(~ Edu) +
theme_bw()
`geom_smooth()` using method = 'loess'
# four quantile
quantile(dta5$income)
0% 25% 50% 75% 100%
611.00 4106.00 5930.50 8187.25 25879.00
dta5$Inc <- as.factor(ifelse(dta5$income <= 4106, "1",
ifelse(dta5$income <= 5930, "2",
ifelse(dta5$income <= 8187, "3", "4"))))
# plot
ggplot(dta5, aes(education, prestige))+
geom_point(pch = 1)+
stat_smooth(se = F)+
facet_wrap(~Inc, nrow = 1)+
theme_bw()
`geom_smooth()` using method = 'loess'
# model
summary(m0 <- gam(prestige ~ s(income) + s(education), data = dta5, family = "gaussian"))
Family: gaussian
Link function: identity
Formula:
prestige ~ s(income) + s(education)
Parametric coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 46.8333 0.6889 67.98 <2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Approximate significance of smooth terms:
edf Ref.df F p-value
s(income) 3.118 3.877 14.61 1.53e-09 ***
s(education) 3.177 3.952 38.78 < 2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
R-sq.(adj) = 0.836 Deviance explained = 84.7%
GCV = 52.143 Scale est. = 48.414 n = 102
# plot
vis.gam(m0, theta = 29, phi = 13, ticktype = "detailed", se = 2)
plot(m0, pages = 1, shift = m0$coef[1])
dta6 <- kyphosis
summary(m0 <- glm(Kyphosis ~ Age + Number + Start, data = dta6, family = "binomial"))
Call:
glm(formula = Kyphosis ~ Age + Number + Start, family = "binomial",
data = dta6)
Deviance Residuals:
Min 1Q Median 3Q Max
-2.3124 -0.5484 -0.3632 -0.1659 2.1613
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -2.036934 1.449575 -1.405 0.15996
Age 0.010930 0.006446 1.696 0.08996 .
Number 0.410601 0.224861 1.826 0.06785 .
Start -0.206510 0.067699 -3.050 0.00229 **
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 83.234 on 80 degrees of freedom
Residual deviance: 61.380 on 77 degrees of freedom
AIC: 69.38
Number of Fisher Scoring iterations: 5
plot(effect("Number", mod = m0), ylab="Probability", xlab="Number", main="")
plot(effect("Age", mod = m0), ylab="Probability", xlab="Age", main="")
plot(effect("Start", mod = m0), ylab="Probability", xlab="Start", main="")
summary(m2 <- gam(Kyphosis ~ s(Age) + Number + s(Start), data = dta6, family = "binomial"))
Family: binomial
Link function: logit
Formula:
Kyphosis ~ s(Age) + Number + s(Start)
Parametric coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -3.5926 1.1464 -3.134 0.00172 **
Number 0.3333 0.2324 1.434 0.15148
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Approximate significance of smooth terms:
edf Ref.df Chi.sq p-value
s(Age) 2.209 2.790 6.305 0.0795 .
s(Start) 2.020 2.523 9.645 0.0146 *
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
R-sq.(adj) = 0.355 Deviance explained = 39.4%
UBRE = -0.22384 Scale est. = 1 n = 81
anova(m2)
Family: binomial
Link function: logit
Formula:
Kyphosis ~ s(Age) + Number + s(Start)
Parametric Terms:
df Chi.sq p-value
Number 1 2.057 0.151
Approximate significance of smooth terms:
edf Ref.df Chi.sq p-value
s(Age) 2.209 2.790 6.305 0.0795
s(Start) 2.020 2.523 9.645 0.0146
vis.gam(m2, theta = 35, phi = 15, tick="detailed", xlab = "Start", ylab = "Age")
vis.gam(m2, theta = -35, phi = 15, tick="detailed", xlab = "Start", ylab = "Age")