Q1
pacman::p_load(HistData)
library(HistData)
example(Galton)
##
## Galton> ## Don't show:
## Galton> # allow to work with car 1
## Galton> if (packageDescription("car")[["Version"]] < 2) {
## Galton+ dataEllipse <- data.ellipse
## Galton+ }
##
## Galton> ## End(Don't show)
## Galton> data(Galton)
##
## Galton> ###########################################################################
## Galton> # sunflower plot with regression line and data ellipses and lowess smooth
## Galton> ###########################################################################
## Galton>
## Galton> with(Galton,
## Galton+ {
## Galton+ sunflowerplot(parent,child, xlim=c(62,74), ylim=c(62,74))
## Galton+ reg <- lm(child ~ parent)
## Galton+ abline(reg)
## Galton+ lines(lowess(parent, child), col="blue", lwd=2)
## Galton+ if(require(car)) {
## Galton+ dataEllipse(parent,child, xlim=c(62,74), ylim=c(62,74), plot.points=FALSE)
## Galton+ }
## Galton+ })
## Loading required package: car

m0 <- lm(child ~ 1, data = Galton); m0
##
## Call:
## lm(formula = child ~ 1, data = Galton)
##
## Coefficients:
## (Intercept)
## 68.09
m1 <- update(m0, . ~ . + parent)
summary(m1)
##
## Call:
## lm(formula = child ~ parent, data = Galton)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7.8050 -1.3661 0.0487 1.6339 5.9264
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 23.94153 2.81088 8.517 <2e-16 ***
## parent 0.64629 0.04114 15.711 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.239 on 926 degrees of freedom
## Multiple R-squared: 0.2105, Adjusted R-squared: 0.2096
## F-statistic: 246.8 on 1 and 926 DF, p-value: < 2.2e-16
anova(m0, m1)
## Analysis of Variance Table
##
## Model 1: child ~ 1
## Model 2: child ~ parent
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 927 5877.2
## 2 926 4640.3 1 1236.9 246.84 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
pacman::p_load(alr4)
library(alr4)
?Heights
## starting httpd help server ...
## done
# Heights in inches of mothers and their daughters
ma <- lm(dheight ~ mheight, data = Heights) ; ma
##
## Call:
## lm(formula = dheight ~ mheight, data = Heights)
##
## Coefficients:
## (Intercept) mheight
## 29.9174 0.5417
summary(ma)
##
## Call:
## lm(formula = dheight ~ mheight, data = Heights)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7.397 -1.529 0.036 1.492 9.053
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 29.91744 1.62247 18.44 <2e-16 ***
## mheight 0.54175 0.02596 20.87 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.266 on 1373 degrees of freedom
## Multiple R-squared: 0.2408, Adjusted R-squared: 0.2402
## F-statistic: 435.5 on 1 and 1373 DF, p-value: < 2.2e-16
require(ggplot2)
## Loading required package: ggplot2
ggplot(ma, aes(sample = scale(resid(ma, "pearson")))) +
stat_qq() +
geom_abline(intercept = 0, slope = 1)+
labs(x = "Normal quantiles",
y = "Quantiles of standardized residuals")

Q2
#
# example of regression with sampling weights
#
# set some options
options(digits = 3, show.signif.stars = FALSE)
# set path to working directory
setwd("C:\\Users\\USER\\Desktop\\多層次\\w3 exercise")
# load packages
pacman::p_load(tidyverse)
# seed the random number generator to get the same sample
set.seed(6102)
require(dplyr)
# pick 72 countries from three regions
# arrange the rows by alphabetical order
dta <- UN11 %>%
filter(region %in% c("Africa", "Asia", "Europe")) %>%
sample_n(72) %>%
arrange(region)
# first 6 lines of data frame
head(dta)
## region group fertility ppgdp lifeExpF pctUrban
## 1 Africa africa 2.64 2654 75.5 44
## 2 Africa africa 4.74 737 63.2 28
## 3 Africa africa 2.38 7255 54.1 62
## 4 Africa africa 4.36 1131 61.0 42
## 5 Africa africa 4.62 802 59.2 23
## 6 Africa africa 4.98 16852 52.9 40
# data dimensions - rows and columns
dim(dta)
## [1] 72 6
# how many countries in each of the three regions
R3 <- table(dta$region)
# percentage of countries from each of the three regions selected
w <- R3/table(UN11$region)
# add the sampling weights variable to data
# skip over countries in regions not selected
dta$wt <- rep(1/w[w != 0], R3[R3 != 0])
# simple regression
m0 <- lm(fertility ~ log(ppgdp), data = dta)
# weighted regression
m1 <- update(m0, weights = wt)
# plot
ggplot(dta, aes(log(ppgdp), fertility, group = region, label = region)) +
geom_text(check_overlap = TRUE, size = rel(2), col = "gray30")+
stat_smooth(aes(group = 1), method = "lm", se = F, col = "gray30") +
stat_smooth(method = "lm", se = F)+
geom_abline(intercept = coef(m1)[1], slope = coef(m1)[2], col = "firebrick") +
labs(x = "GDP (US$ in log units)", y = "Number of children per woman") +
theme_bw()
