example of regression with sampling weights
load packages
load data
## starting httpd help server ... done
seed the random number generator to get the same sample
pick 81 countries from three regions
# Build the analysis sample `dta` from UN11 (read the calls inside-out):
#   1. filter()   - keep only the countries in Africa, Asia, or Europe
#   2. sample_n() - randomly draw 81 of the remaining countries
#   3. arrange()  - sort the sampled rows by region
dta <- arrange(
  sample_n(
    filter(UN11, region %in% c("Africa", "Asia", "Europe")),
    size = 81
  ),
  region
)
# From UN11, select 81 countries from the three regions Africa, Asia, and Europe, sort them by region, and store the result as dta
first 6 lines of data frame
## region group fertility ppgdp lifeExpF pctUrban
## 1 Africa africa 3.99 1333 65.8 52
## 2 Africa africa 2.34 11451 78.0 56
## 3 Africa africa 3.19 12469 64.3 86
## 4 Africa africa 2.41 11321 77.9 78
## 5 Africa africa 5.08 741 58.7 42
## 6 Africa africa 5.75 520 57.0 27
data dimensions - rows and columns
## [1] 81 6
how many countries in each of the three regions
##
## Africa Asia Caribbean Europe Latin Amer
## 32 27 0 22 0
## North America
## 0
proportion of the countries in each of the three regions that were selected (the sampling fraction per region)
##
## Africa Asia Caribbean Europe Latin Amer
## 0.604 0.540 0.000 0.564 0.000
## North America
## 0.000
add the sampling weights variable to data
##
## 1.65625 1.77272727272727 1.85185185185185
## 32 22 27
simple regression
##
## Call:
## lm(formula = fertility ~ log(ppgdp), data = dta)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.2686 -0.7716 0.0497 0.6811 2.6292
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 8.313 0.575 14.46 < 2e-16
## log(ppgdp) -0.652 0.068 -9.58 7.2e-15
##
## Residual standard error: 1.07 on 79 degrees of freedom
## Multiple R-squared: 0.537, Adjusted R-squared: 0.532
## F-statistic: 91.8 on 1 and 79 DF, p-value: 7.15e-15
##
## Call:
## lm(formula = fertility ~ ppgdp, data = dta)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.913 -1.106 -0.331 0.980 3.516
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.42e+00 1.87e-01 18.34 < 2e-16
## ppgdp -3.64e-05 7.62e-06 -4.77 8.3e-06
##
## Residual standard error: 1.39 on 79 degrees of freedom
## Multiple R-squared: 0.224, Adjusted R-squared: 0.214
## F-statistic: 22.7 on 1 and 79 DF, p-value: 8.32e-06
weighted regression
##
## Call:
## lm(formula = fertility ~ log(ppgdp), data = dta, weights = wt)
##
## Weighted Residuals:
## Min 1Q Median 3Q Max
## -3.031 -1.001 0.063 0.921 3.425
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 8.210 0.577 14.22 < 2e-16
## log(ppgdp) -0.642 0.068 -9.44 1.3e-14
##
## Residual standard error: 1.42 on 79 degrees of freedom
## Multiple R-squared: 0.53, Adjusted R-squared: 0.524
## F-statistic: 89.1 on 1 and 79 DF, p-value: 1.35e-14
plot
# Plot fertility against log per-capita GDP. Each country is drawn as its
# region name, colored by region. Two fitted regression lines are overlaid and
# told apart by color: "peru" is the unweighted fit, "gray" is the fit that
# uses the sampling weights in `wt`.
# x-axis: GDP (log units); y-axis: fertility (children per woman).
ggplot(dta, aes(log(ppgdp), fertility, label = region)) +
  # unweighted least-squares line
  stat_smooth(
    method = "lm", formula = y ~ x, se = FALSE,
    col = "peru", lwd = 0.5
  ) +
  # least-squares line weighted by the sampling weights
  stat_smooth(
    aes(weight = wt),
    method = "lm", formula = y ~ x, se = FALSE,
    col = "gray", lwd = 0.5
  ) +
  # region names as point labels; overlapping labels are dropped
  geom_text(aes(color = region), check_overlap = TRUE, size = 2.3) +
  labs(
    x = "GDP (US$ in log unit)",
    y = "Number of children per woman"
  ) +
  theme_minimal() +
  # "none" (lowercase) is the documented value that suppresses the legend
  theme(legend.position = "none")