1 Hw exercise 1

example of regression with sampling weights

# Session-wide print settings: show 3 significant digits and drop the
# significance stars from coefficient tables.
options(
  show.signif.stars = FALSE,
  digits = 3
)

load packages

# Load the packages used below: alr4 (source of the UN11 data) and the
# tidyverse (dplyr verbs and ggplot2).
pacman::p_load(char = c("alr4", "tidyverse"))

load data

# Load the UN11 data set shipped with the alr4 package.
data("UN11", package = "alr4")
# Open the help page to read what the data set contains.
help("UN11")

## starting httpd help server ... done

seed the random number generator to get the same sample

# Fix the random seed so the random draw of countries is reproducible.
set.seed(seed = 6102)

pick 81 countries from three regions

# Build the working sample `dta` from UN11 in three steps:
#   1. keep only the countries in Africa, Asia and Europe;
#   2. draw 81 of those countries at random;
#   3. sort the sampled rows by region.
dta <- filter(UN11, region %in% c("Africa", "Asia", "Europe"))
dta <- sample_n(dta, 81)
dta <- arrange(dta, region)

first 6 lines of data frame

# Show the first six rows of the sample.
head(dta, n = 6L)

##   region  group fertility ppgdp lifeExpF pctUrban
## 1 Africa africa      3.99  1333     65.8       52
## 2 Africa africa      2.34 11451     78.0       56
## 3 Africa africa      3.19 12469     64.3       86
## 4 Africa africa      2.41 11321     77.9       78
## 5 Africa africa      5.08   741     58.7       42
## 6 Africa africa      5.75   520     57.0       27

data dimensions - rows and columns

# Dimensions of the sample: number of rows, then number of columns.
c(nrow(dta), ncol(dta))

## [1] 81  6

how many countries in each of the three regions

# Count the sampled countries per region and keep the counts as R3.
# Regions that were filtered out still appear as factor levels with a
# count of 0; head() prints only the first six entries of the table.
R3 <- table(dta[["region"]])
head(R3, n = 6L)

## 
##        Africa          Asia     Caribbean        Europe    Latin Amer 
##            32            27             0            22             0 
## North America 
##             0

percentage of countries from each of the three regions selected

# Sampling fraction per region: the number of countries drawn into dta (R3)
# divided by the number of countries that region has in the full UN11 data.
w <- R3 / table(UN11[["region"]])
head(w, n = 6L)

## 
##        Africa          Asia     Caribbean        Europe    Latin Amer 
##         0.604         0.540         0.000         0.564         0.000 
## North America 
##         0.000

add the sampling weights variable to data

# Attach each country's sampling weight: the inverse of its region's
# sampling fraction, i.e. (countries in UN11) / (countries sampled).
# The weight is looked up by region name for every row, so the result does
# not rely on dta being sorted by region in the table's level order, and
# regions that were not sampled are simply never indexed (no comparison of
# a numeric fraction against zero is needed).
dta$wt <- as.numeric(1 / w[as.character(dta$region)])
# Tabulate the weights: one distinct value per sampled region.
table(dta$wt)

## 
##          1.65625 1.77272727272727 1.85185185185185 
##               32               22               27

# The sampling weights are now attached to dta; regions from which no country was selected contribute no weights.

simple regression

# Simple (unweighted) linear regression of fertility on log per-capita GDP,
# followed by its summary.
m0 <- lm(fertility ~ log(ppgdp), data = dta)
summary(m0)

## 
## Call:
## lm(formula = fertility ~ log(ppgdp), data = dta)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.2686 -0.7716  0.0497  0.6811  2.6292 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)
## (Intercept)    8.313      0.575   14.46  < 2e-16
## log(ppgdp)    -0.652      0.068   -9.58  7.2e-15
## 
## Residual standard error: 1.07 on 79 degrees of freedom
## Multiple R-squared:  0.537,  Adjusted R-squared:  0.532 
## F-statistic: 91.8 on 1 and 79 DF,  p-value: 7.15e-15

# Same regression with ppgdp left on its raw scale, for comparison.
# Why take the log: in the run recorded in this document the log model fits
# clearly better (R-squared 0.537 vs 0.224), which suggests fertility is
# closer to linear in log(ppgdp) than in ppgdp itself. With log(ppgdp) the
# slope is the change in fertility per one-unit increase in log GDP, i.e.
# per multiplicative (about 2.7-fold) increase in GDP.
m2 <- lm(fertility ~ ppgdp, data = dta)
summary(m2)

## 
## Call:
## lm(formula = fertility ~ ppgdp, data = dta)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -1.913 -1.106 -0.331  0.980  3.516 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)
## (Intercept)  3.42e+00   1.87e-01   18.34  < 2e-16
## ppgdp       -3.64e-05   7.62e-06   -4.77  8.3e-06
## 
## Residual standard error: 1.39 on 79 degrees of freedom
## Multiple R-squared:  0.224,  Adjusted R-squared:  0.214 
## F-statistic: 22.7 on 1 and 79 DF,  p-value: 8.32e-06

weighted regression

# Weighted regression: refit m0 with the sampling weights stored in dta$wt,
# then summarise the weighted fit.
m1 <- update(m0, weights = wt)
summary(m1)

## 
## Call:
## lm(formula = fertility ~ log(ppgdp), data = dta, weights = wt)
## 
## Weighted Residuals:
##    Min     1Q Median     3Q    Max 
## -3.031 -1.001  0.063  0.921  3.425 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)
## (Intercept)    8.210      0.577   14.22  < 2e-16
## log(ppgdp)    -0.642      0.068   -9.44  1.3e-14
## 
## Residual standard error: 1.42 on 79 degrees of freedom
## Multiple R-squared:  0.53,   Adjusted R-squared:  0.524 
## F-statistic: 89.1 on 1 and 79 DF,  p-value: 1.35e-14

plot

# Scatter of fertility (y) against log per-capita GDP (x), with each country
# drawn as its region name and coloured by region. Two fitted lines are
# overlaid and told apart by colour:
#   - "peru": ordinary least-squares fit
#   - "gray": least-squares fit using the sampling weights in wt
# Layer sizes are plain numbers (rel() is meant for theme elements), FALSE is
# spelled out instead of F, and the legend is hidden with the documented
# value "none" (the upper-case "NONE" is not a valid legend position).
ggplot(
  dta,
  aes(log(ppgdp), fertility, label = region)
) +
  stat_smooth(
    method = "lm", formula = y ~ x, se = FALSE,
    col = "peru", lwd = 0.5
  ) +
  stat_smooth(
    aes(weight = wt),
    method = "lm", formula = y ~ x, se = FALSE,
    col = "gray", lwd = 0.5
  ) +
  geom_text(aes(color = region), check_overlap = TRUE, size = 2.3) +
  labs(
    x = "GDP (US$ in log unit)",
    y = "Number of children per woman"
  ) +
  theme_minimal() +
  theme(legend.position = "none")

# The end

Hw Exercise1

tjlee

2020-09-28

1 Hw exercise 1