Dữ liệu ‘gapminder’ (trong package ‘gapminder’, Hans Rosling)

library(gapminder)
data(gapminder)
head(gapminder)
## # A tibble: 6 x 6
##   country     continent  year lifeExp      pop gdpPercap
##   <fct>       <fct>     <int>   <dbl>    <int>     <dbl>
## 1 Afghanistan Asia       1952    28.8  8425333      779.
## 2 Afghanistan Asia       1957    30.3  9240934      821.
## 3 Afghanistan Asia       1962    32.0 10267083      853.
## 4 Afghanistan Asia       1967    34.0 11537966      836.
## 5 Afghanistan Asia       1972    36.1 13079460      740.
## 6 Afghanistan Asia       1977    38.4 14880372      786.
dim(gapminder)
## [1] 1704    6
library(table1)
## 
## Attaching package: 'table1'
## The following objects are masked from 'package:base':
## 
##     units, units<-
table1(~country, data=gapminder)
Overall
(n=1704)
country
Afghanistan 12 (0.7%)
Albania 12 (0.7%)
Algeria 12 (0.7%)
Angola 12 (0.7%)
Argentina 12 (0.7%)
Australia 12 (0.7%)
Austria 12 (0.7%)
Bahrain 12 (0.7%)
Bangladesh 12 (0.7%)
Belgium 12 (0.7%)
Benin 12 (0.7%)
Bolivia 12 (0.7%)
Bosnia and Herzegovina 12 (0.7%)
Botswana 12 (0.7%)
Brazil 12 (0.7%)
Bulgaria 12 (0.7%)
Burkina Faso 12 (0.7%)
Burundi 12 (0.7%)
Cambodia 12 (0.7%)
Cameroon 12 (0.7%)
Canada 12 (0.7%)
Central African Republic 12 (0.7%)
Chad 12 (0.7%)
Chile 12 (0.7%)
China 12 (0.7%)
Colombia 12 (0.7%)
Comoros 12 (0.7%)
Congo, Dem. Rep. 12 (0.7%)
Congo, Rep. 12 (0.7%)
Costa Rica 12 (0.7%)
Cote d'Ivoire 12 (0.7%)
Croatia 12 (0.7%)
Cuba 12 (0.7%)
Czech Republic 12 (0.7%)
Denmark 12 (0.7%)
Djibouti 12 (0.7%)
Dominican Republic 12 (0.7%)
Ecuador 12 (0.7%)
Egypt 12 (0.7%)
El Salvador 12 (0.7%)
Equatorial Guinea 12 (0.7%)
Eritrea 12 (0.7%)
Ethiopia 12 (0.7%)
Finland 12 (0.7%)
France 12 (0.7%)
Gabon 12 (0.7%)
Gambia 12 (0.7%)
Germany 12 (0.7%)
Ghana 12 (0.7%)
Greece 12 (0.7%)
Guatemala 12 (0.7%)
Guinea 12 (0.7%)
Guinea-Bissau 12 (0.7%)
Haiti 12 (0.7%)
Honduras 12 (0.7%)
Hong Kong, China 12 (0.7%)
Hungary 12 (0.7%)
Iceland 12 (0.7%)
India 12 (0.7%)
Indonesia 12 (0.7%)
Iran 12 (0.7%)
Iraq 12 (0.7%)
Ireland 12 (0.7%)
Israel 12 (0.7%)
Italy 12 (0.7%)
Jamaica 12 (0.7%)
Japan 12 (0.7%)
Jordan 12 (0.7%)
Kenya 12 (0.7%)
Korea, Dem. Rep. 12 (0.7%)
Korea, Rep. 12 (0.7%)
Kuwait 12 (0.7%)
Lebanon 12 (0.7%)
Lesotho 12 (0.7%)
Liberia 12 (0.7%)
Libya 12 (0.7%)
Madagascar 12 (0.7%)
Malawi 12 (0.7%)
Malaysia 12 (0.7%)
Mali 12 (0.7%)
Mauritania 12 (0.7%)
Mauritius 12 (0.7%)
Mexico 12 (0.7%)
Mongolia 12 (0.7%)
Montenegro 12 (0.7%)
Morocco 12 (0.7%)
Mozambique 12 (0.7%)
Myanmar 12 (0.7%)
Namibia 12 (0.7%)
Nepal 12 (0.7%)
Netherlands 12 (0.7%)
New Zealand 12 (0.7%)
Nicaragua 12 (0.7%)
Niger 12 (0.7%)
Nigeria 12 (0.7%)
Norway 12 (0.7%)
Oman 12 (0.7%)
Pakistan 12 (0.7%)
Panama 12 (0.7%)
Paraguay 12 (0.7%)
Peru 12 (0.7%)
Philippines 12 (0.7%)
Poland 12 (0.7%)
Portugal 12 (0.7%)
Puerto Rico 12 (0.7%)
Reunion 12 (0.7%)
Romania 12 (0.7%)
Rwanda 12 (0.7%)
Sao Tome and Principe 12 (0.7%)
Saudi Arabia 12 (0.7%)
Senegal 12 (0.7%)
Serbia 12 (0.7%)
Sierra Leone 12 (0.7%)
Singapore 12 (0.7%)
Slovak Republic 12 (0.7%)
Slovenia 12 (0.7%)
Somalia 12 (0.7%)
South Africa 12 (0.7%)
Spain 12 (0.7%)
Sri Lanka 12 (0.7%)
Sudan 12 (0.7%)
Swaziland 12 (0.7%)
Sweden 12 (0.7%)
Switzerland 12 (0.7%)
Syria 12 (0.7%)
Taiwan 12 (0.7%)
Tanzania 12 (0.7%)
Thailand 12 (0.7%)
Togo 12 (0.7%)
Trinidad and Tobago 12 (0.7%)
Tunisia 12 (0.7%)
Turkey 12 (0.7%)
Uganda 12 (0.7%)
United Kingdom 12 (0.7%)
United States 12 (0.7%)
Uruguay 12 (0.7%)
Venezuela 12 (0.7%)
Vietnam 12 (0.7%)
West Bank and Gaza 12 (0.7%)
Yemen, Rep. 12 (0.7%)
Zambia 12 (0.7%)
Zimbabwe 12 (0.7%)
vn = subset(gapminder, country=="Vietnam")
vn
## # A tibble: 12 x 6
##    country continent  year lifeExp      pop gdpPercap
##    <fct>   <fct>     <int>   <dbl>    <int>     <dbl>
##  1 Vietnam Asia       1952    40.4 26246839      605.
##  2 Vietnam Asia       1957    42.9 28998543      676.
##  3 Vietnam Asia       1962    45.4 33796140      772.
##  4 Vietnam Asia       1967    47.8 39463910      637.
##  5 Vietnam Asia       1972    50.3 44655014      700.
##  6 Vietnam Asia       1977    55.8 50533506      714.
##  7 Vietnam Asia       1982    58.8 56142181      707.
##  8 Vietnam Asia       1987    62.8 62826491      821.
##  9 Vietnam Asia       1992    67.7 69940728      989.
## 10 Vietnam Asia       1997    70.7 76048996     1386.
## 11 Vietnam Asia       2002    73.0 80908147     1764.
## 12 Vietnam Asia       2007    74.2 85262356     2442.
# Tuổi thọ trung bình ở người Việt 1952 - 2007
plot(lifeExp ~ year, pch=16, col="blue", data=vn)

# Tăng trưởng tuổi thọ trung bì ## Mô hình hồi qui tuyến tính: ## LE = 𝛼 + 𝛽×Year + 𝜖 - Triển khai bằng

m = lm(lifeExp ~ year, data=vn)
     summary(m)
## 
## Call:
## lm(formula = lifeExp ~ year, data = vn)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.1884 -0.5840  0.1335  0.7396  1.7873 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -1.272e+03  4.349e+01  -29.25 5.10e-11 ***
## year         6.716e-01  2.197e-02   30.57 3.29e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.314 on 10 degrees of freedom
## Multiple R-squared:  0.9894, Adjusted R-squared:  0.9884 
## F-statistic: 934.5 on 1 and 10 DF,  p-value: 3.289e-11

Phương trình mô tả xu hướng tăng tuổi thọ:

LE = 1272 + 0.672 x Year

Tuổi thọ người Việt trong 55 qua tăng trung bình 0.67 / năm.

Mô hình tiên lượng thích hợp?

library(visreg)
visreg(m)

# Hoán chuyển đơn vị X # Phương trình mô tả xu hướng tăng tuổi thọ: ## LE = 1272 + 0.672 x Year # Chú ý hằng số intercept = 1272. Ý nghĩa là nếu năm = 0 thì tuổi thọ trung bình là 1272! ## Vô nghĩa!

vn$Year = vn$year - 1952
m = lm(lifeExp ~ Year, data=vn); summary(m)
## 
## Call:
## lm(formula = lifeExp ~ Year, data = vn)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.1884 -0.5840  0.1335  0.7396  1.7873 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 39.01008    0.71334   54.69 1.01e-13 ***
## Year         0.67162    0.02197   30.57 3.29e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.314 on 10 degrees of freedom
## Multiple R-squared:  0.9894, Adjusted R-squared:  0.9884 
## F-statistic: 934.5 on 1 and 10 DF,  p-value: 3.289e-11

Ý nghĩa hằng số intercept khi Year = 0

Phương trình mô tả xu hướng tăng tuổi thọ (đơn vị gốc year = 1952, 1957, …, 2007:

LE = 1272 + 0.672 x Year (1)

Phương trình mô tả xu hướng tăng tuổi thọ (đơn vị scale Year = 0, 5, 10, …, 55

LE = 39 + 0.672 x Year (2)

Phương trình (2) cho thấy tại năm 1952 (Year = 0), tuổi thọ trung bình là 39.

Thu nhập bình quân VN 1952 – 2007

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

plot(gdpPercap ~ year, pch=16, col="blue", data=vn)

m = lm(gdpPercap ~ year, data=vn) 
summary(m)
## 
## Call:
## lm(formula = gdpPercap ~ year, data = vn)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -387.87 -267.13  -69.85  207.79  723.69 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)   
## (Intercept) -49382.260  11587.193  -4.262  0.00166 **
## year            25.461      5.853   4.350  0.00144 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 350 on 10 degrees of freedom
## Multiple R-squared:  0.6542, Adjusted R-squared:  0.6196 
## F-statistic: 18.92 on 1 and 10 DF,  p-value: 0.001444

Mô hình tiên lượng thu nhập bình quân thích hợp?

m = lm(gdpPercap ~ year, data=vn)
visreg(m)

Mô hình cho tăng trưởng thu nhập bình quân

m = lm(gdpPercap ~ year + I(year^2), data=vn)
summary(m)
## 
## Call:
## lm(formula = gdpPercap ~ year + I(year^2), data = vn)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -212.84  -95.87  -24.78  103.39  223.31 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  4.228e+06  6.868e+05   6.156 0.000167 ***
## year        -4.297e+03  6.939e+02  -6.192 0.000160 ***
## I(year^2)    1.092e+00  1.753e-01   6.228 0.000154 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 160.1 on 9 degrees of freedom
## Multiple R-squared:  0.9349, Adjusted R-squared:  0.9204 
## F-statistic: 64.61 on 2 and 9 DF,  p-value: 4.587e-06

Phương trình mô tả xu hướng tăng trưởng thu nhập bình quân

Income = 4228000 - 4297 x Year + 1.092 x (Year)2

# Mô hình tiên lượng thu nhập bình quân thích hợp?
m = lm(gdpPercap ~ year + I(year^2), data=vn) 
visreg(m)

vn$pred = predict(m)
vn
## # A tibble: 12 x 8
##    country continent  year lifeExp      pop gdpPercap  Year  pred
##    <fct>   <fct>     <int>   <dbl>    <int>     <dbl> <dbl> <dbl>
##  1 Vietnam Asia       1952    40.4 26246839      605.     0  818.
##  2 Vietnam Asia       1957    42.9 28998543      676.     5  672.
##  3 Vietnam Asia       1962    45.4 33796140      772.    10  581.
##  4 Vietnam Asia       1967    47.8 39463910      637.    15  545.
##  5 Vietnam Asia       1972    50.3 44655014      700.    20  563.
##  6 Vietnam Asia       1977    55.8 50533506      714.    25  636.
##  7 Vietnam Asia       1982    58.8 56142181      707.    30  763.
##  8 Vietnam Asia       1987    62.8 62826491      821.    35  945.
##  9 Vietnam Asia       1992    67.7 69940728      989.    40 1181.
## 10 Vietnam Asia       1997    70.7 76048996     1386.    45 1472.
## 11 Vietnam Asia       2002    73.0 80908147     1764.    50 1818.
## 12 Vietnam Asia       2007    74.2 85262356     2442.    55 2218.

Mô hình “spine” cho tăng trưởng thu nhập

Mô hình hồi qui tuyến tính spline:

I=𝛼+𝛽×Year+𝛾×Year−1992 +𝜖

  • Triển khai bằng R
vn$knot = ifelse(vn$year>1992, 1, 0) 
vn$diff = vn$year - 1992
vn$int = vn$diff * vn$knot
m = lm(gdpPercap ~ year + int, data=vn) 
summary(m)
## 
## Call:
## lm(formula = gdpPercap ~ year + int, data = vn)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -106.291  -54.225   -5.144   41.823  133.329 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -11328.061   3963.982  -2.858   0.0188 *  
## year             6.116      2.009   3.044   0.0139 *  
## int             95.389      7.244  13.168 3.48e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 81.95 on 9 degrees of freedom
## Multiple R-squared:  0.9829, Adjusted R-squared:  0.9791 
## F-statistic: 259.2 on 2 and 9 DF,  p-value: 1.107e-08

Từ 1955 đến 1992, thu nhập bình quân tăng 6 USD/năm, nhưng từ 1992 trở đi, thu nhập tăng 95 USD / năm.

library(plotly)
## Loading required package: ggplot2
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
plot_ly(vn, x=~year, y=~gdpPercap, type="scatter")%>% add_lines(x = ~year, y = fitted(m))
## No scatter mode specifed:
##   Setting the mode to markers
##   Read more about this attribute -> https://plot.ly/r/reference/#scatter-mode

When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

summary(cars)
##      speed           dist       
##  Min.   : 4.0   Min.   :  2.00  
##  1st Qu.:12.0   1st Qu.: 26.00  
##  Median :15.0   Median : 36.00  
##  Mean   :15.4   Mean   : 42.98  
##  3rd Qu.:19.0   3rd Qu.: 56.00  
##  Max.   :25.0   Max.   :120.00

Including Plots

You can also embed plots, for example:

Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.