1204_Semipar

#peijun

Refer to the questions here.

Q2 Body mass (BMI)

library(gamlss) dta2 <- dbbmi # plot ggplot(dta2, aes(age, bmi)) + geom_point(alpha = .3, size = rel(.8)) + stat_smooth(method = "gam", formula = y ~ s(x))

# model m0 <- lm(bmi ~ age, data = dta2) summary(m0s <- segmented(m0, z = ~ age))

***Regression Model with Segmented Relationship(s)*** Call: segmented.lm(obj = m0, z = ~age) Estimated Break-Point(s): Est. St.Err 8.065 0.222 Meaningful coefficients of the linear terms: Estimate Std. Error t value Pr(>|t|) (Intercept) 16.53511 0.05527 299.176 < 2e-16 *** age -0.09700 0.01942 -4.995 6.02e-07 *** U1.age 0.64092 0.02180 29.398 NA --- Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 Residual standard error: 2.13 on 7290 degrees of freedom Multiple R-Squared: 0.4636, Adjusted R-squared: 0.4634 Convergence attained in 5 iterations with relative change 8.169368e-06

rhs <- function(x, k) ifelse(x > k, x - k, 0) summary(m0p2 <- lm(bmi ~ age + rhs(age, m0s$psi[2]), data = dta2))

Call: lm(formula = bmi ~ age + rhs(age, m0s$psi[2]), data = dta2) Residuals: Min 1Q Median 3Q Max -5.8572 -1.3725 -0.2234 1.0774 15.2086 Coefficients: Estimate Std. Error t value Pr(>|t|) (Intercept) 16.53511 0.05071 326.046 <2e-16 *** age -0.09700 0.01105 -8.779 <2e-16 *** rhs(age, m0s$psi[2]) 0.64092 0.01848 34.683 <2e-16 *** --- Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 Residual standard error: 2.13 on 7291 degrees of freedom Multiple R-squared: 0.4636, Adjusted R-squared: 0.4635 F-statistic: 3151 on 2 and 7291 DF, p-value: < 2.2e-16

Q3 Union, wage, gender

data("CPS1985", package = "AER") dta3 <- CPS1985 ggplot(dta3, aes(x = union, y = wage)) + geom_bar(stat = "identity") + facet_wrap(~ gender)

# model summary(m0 <- glm(union ~ wage + gender, data = dta3, family = "binomial"))

Call: glm(formula = union ~ wage + gender, family = "binomial", data = dta3) Deviance Residuals: Min 1Q Median 3Q Max -1.2310 -0.6685 -0.5170 -0.4363 2.2196 Coefficients: Estimate Std. Error z value Pr(>|z|) (Intercept) -1.80629 0.26013 -6.944 3.81e-12 *** wage 0.06023 0.02025 2.974 0.00294 ** genderfemale -0.74873 0.24908 -3.006 0.00265 ** --- Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 (Dispersion parameter for binomial family taken to be 1) Null deviance: 503.08 on 533 degrees of freedom Residual deviance: 481.00 on 531 degrees of freedom AIC: 487 Number of Fisher Scoring iterations: 4

plot(effect("wage", mod = m0), ylab = "Probability", xlab = "wage")

plot(effect("gender", mod = m0), ylab = "Probability", xlab = "gender")

summary(m1 <- gam(union ~ gender + s(wage, by = gender), data = dta3, family = "binomial"))

Family: binomial Link function: logit Formula: union ~ gender + s(wage, by = gender) Parametric coefficients: Estimate Std. Error z value Pr(>|z|) (Intercept) -1.4376 0.1752 -8.205 2.3e-16 *** genderfemale -0.5666 0.2693 -2.104 0.0354 * --- Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 Approximate significance of smooth terms: edf Ref.df Chi.sq p-value s(wage):gendermale 3.431 4.262 21.059 0.000395 *** s(wage):genderfemale 2.081 2.623 6.731 0.068691 . --- Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 R-sq.(adj) = 0.0849 Deviance explained = 9.73% UBRE = -0.12143 Scale est. = 1 n = 534

anova(m0, m1)

Analysis of Deviance Table Model 1: union ~ wage + gender Model 2: union ~ gender + s(wage, by = gender) Resid. Df Resid. Dev Df Deviance 1 531.00 481.00 2 526.49 454.13 4.5122 26.868

Q4 Mortality rate in Sweden

dta4 <- read.table("mortality_sweden.csv", h = T, sep = ",") dta4$logE <- log(dta4$Exposure) # plot ggplot(dta4, aes(x = Year, y = log(Count/Exposure), group = Gender, color = Gender)) + geom_point() + stat_smooth(method = "auto") + theme(legend.position = c(.1, .1))

`geom_smooth()` using method = 'loess'

# model summary(m0 <- gam(Count ~ offset(logE) + Gender + s(Year), family = "poisson", data = dta4))

Family: poisson Link function: log Formula: Count ~ offset(logE) + Gender + s(Year) Parametric coefficients: Estimate Std. Error z value Pr(>|z|) (Intercept) -4.0097738 0.0003343 -11995.9 <2e-16 *** GenderM 0.3013409 0.0004515 667.5 <2e-16 *** --- Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 Approximate significance of smooth terms: edf Ref.df Chi.sq p-value s(Year) 8.991 9 5622270 <2e-16 *** --- Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 R-sq.(adj) = 0.755 Deviance explained = 94.1% UBRE = 750.16 Scale est. = 1 n = 522

# by gender summary(m1 <- gam(Count ~ offset(logE) + Gender + s(Year, by = Gender), family = "poisson", data = dta4))

Family: poisson Link function: log Formula: Count ~ offset(logE) + Gender + s(Year, by = Gender) Parametric coefficients: Estimate Std. Error z value Pr(>|z|) (Intercept) -4.0064015 0.0003363 -11913.6 <2e-16 *** GenderM 0.2846730 0.0004641 613.3 <2e-16 *** --- Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 Approximate significance of smooth terms: edf Ref.df Chi.sq p-value s(Year):GenderF 8.992 9 3091133 <2e-16 *** s(Year):GenderM 8.996 9 2559070 <2e-16 *** --- Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 R-sq.(adj) = 0.855 Deviance explained = 96.2% UBRE = 479.12 Scale est. = 1 n = 522

# overdispersion summary(m2 <- gam(Count ~ offset(logE) + Gender + s(Year, by = Gender),scale = -1, family = poisson, data = dta4))

Family: poisson Link function: log Formula: Count ~ offset(logE) + Gender + s(Year, by = Gender) Parametric coefficients: Estimate Std. Error t value Pr(>|t|) (Intercept) -4.006341 0.007822 -512.20 <2e-16 *** GenderM 0.284659 0.010796 26.37 <2e-16 *** --- Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 Approximate significance of smooth terms: edf Ref.df F p-value s(Year):GenderF 8.678 8.970 636.6 <2e-16 *** s(Year):GenderM 8.807 8.989 525.9 <2e-16 *** --- Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 R-sq.(adj) = 0.855 Deviance explained = 96.2% GCV = 518.54 Scale est. = 541.13 n = 522

anova(m0, m1, m2)

Analysis of Deviance Table Model 1: Count ~ offset(logE) + Gender + s(Year) Model 2: Count ~ offset(logE) + Gender + s(Year, by = Gender) Model 3: Count ~ offset(logE) + Gender + s(Year, by = Gender) Resid. Df Resid. Dev Df Deviance 1 511.00 392082 2 502.00 250582 9.000001 141501 3 502.04 250845 -0.041649 -263

# plot (m2 fit most) dta4$yhat <- fitted(m2) ggplot(dta4, aes(Year, log(Count/Exposure), color = Gender)) + geom_point() + geom_line(aes(Year, log(yhat/Exposure)), size = 1) + theme(legend.position = c(.1, .1))

Q5 Canadian prestige

library(car) dta5 <- Prestige # segment dta5$Edu <- as.factor(ifelse(dta5$education <= 8.936, "low", ifelse(dta5$education <= 11.473, "med", "high"))) dta5$Edu <- factor(dta5$Edu, levels = c("low", "med", "high")) #low = 1, med = 2, high =3 #plot ggplot(dta5, aes(x = income, y = prestige)) + geom_point(pch = 1) + stat_smooth(se = F) + facet_wrap(~ Edu) + theme_bw()

`geom_smooth()` using method = 'loess'

# four quantile quantile(dta5$income)

0% 25% 50% 75% 100% 611.00 4106.00 5930.50 8187.25 25879.00

dta5$Inc <- as.factor(ifelse(dta5$income <= 4106, "1", ifelse(dta5$income <= 5930, "2", ifelse(dta5$income <= 8187, "3", "4")))) # plot ggplot(dta5, aes(education, prestige))+ geom_point(pch = 1)+ stat_smooth(se = F)+ facet_wrap(~Inc, nrow = 1)+ theme_bw()

`geom_smooth()` using method = 'loess'

# model summary(m0 <- gam(prestige ~ s(income) + s(education), data = dta5, family = "gaussian"))

Family: gaussian Link function: identity Formula: prestige ~ s(income) + s(education) Parametric coefficients: Estimate Std. Error t value Pr(>|t|) (Intercept) 46.8333 0.6889 67.98 <2e-16 *** --- Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 Approximate significance of smooth terms: edf Ref.df F p-value s(income) 3.118 3.877 14.61 1.53e-09 *** s(education) 3.177 3.952 38.78 < 2e-16 *** --- Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 R-sq.(adj) = 0.836 Deviance explained = 84.7% GCV = 52.143 Scale est. = 48.414 n = 102

# plot vis.gam(m0, theta = 29, phi = 13, ticktype = "detailed", se = 2)

plot(m0, pages = 1, shift = m0$coef[1])

Q6 Kyphosis

dta6 <- kyphosis summary(m0 <- glm(Kyphosis ~ Age + Number + Start, data = dta6, family = "binomial"))

Call: glm(formula = Kyphosis ~ Age + Number + Start, family = "binomial", data = dta6) Deviance Residuals: Min 1Q Median 3Q Max -2.3124 -0.5484 -0.3632 -0.1659 2.1613 Coefficients: Estimate Std. Error z value Pr(>|z|) (Intercept) -2.036934 1.449575 -1.405 0.15996 Age 0.010930 0.006446 1.696 0.08996 . Number 0.410601 0.224861 1.826 0.06785 . Start -0.206510 0.067699 -3.050 0.00229 ** --- Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 (Dispersion parameter for binomial family taken to be 1) Null deviance: 83.234 on 80 degrees of freedom Residual deviance: 61.380 on 77 degrees of freedom AIC: 69.38 Number of Fisher Scoring iterations: 5

plot(effect("Number", mod = m0), ylab="Probability", xlab="Number", main="")

plot(effect("Age", mod = m0), ylab="Probability", xlab="Age", main="")

plot(effect("Start", mod = m0), ylab="Probability", xlab="Start", main="")

summary(m2 <- gam(Kyphosis ~ s(Age) + Number + s(Start), data = dta6, family = "binomial"))

Family: binomial Link function: logit Formula: Kyphosis ~ s(Age) + Number + s(Start) Parametric coefficients: Estimate Std. Error z value Pr(>|z|) (Intercept) -3.5926 1.1464 -3.134 0.00172 ** Number 0.3333 0.2324 1.434 0.15148 --- Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 Approximate significance of smooth terms: edf Ref.df Chi.sq p-value s(Age) 2.209 2.790 6.305 0.0795 . s(Start) 2.020 2.523 9.645 0.0146 * --- Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 R-sq.(adj) = 0.355 Deviance explained = 39.4% UBRE = -0.22384 Scale est. = 1 n = 81

anova(m2)

Family: binomial Link function: logit Formula: Kyphosis ~ s(Age) + Number + s(Start) Parametric Terms: df Chi.sq p-value Number 1 2.057 0.151 Approximate significance of smooth terms: edf Ref.df Chi.sq p-value s(Age) 2.209 2.790 6.305 0.0795 s(Start) 2.020 2.523 9.645 0.0146

vis.gam(m2, theta = 35, phi = 15, tick="detailed", xlab = "Start", ylab = "Age")

vis.gam(m2, theta = -35, phi = 15, tick="detailed", xlab = "Start", ylab = "Age")