gapminder data set from R
str(gapminder)
Classes ‘tbl_df’, ‘tbl’ and 'data.frame': 1704 obs. of 6 variables:
$ country : Factor w/ 142 levels "Afghanistan",..: 1 1 1 1 1 1 1 1 1 1 ...
$ continent: Factor w/ 5 levels "Africa","Americas",..: 3 3 3 3 3 3 3 3 3 3 ...
$ year : int 1952 1957 1962 1967 1972 1977 1982 1987 1992 1997 ...
$ lifeExp : num 28.8 30.3 32 34 36.1 ...
$ pop : int 8425333 9240934 10267083 11537966 13079460 14880372 12881816 13867957 16317921 22227415 ...
$ gdpPercap: num 779 821 853 836 740 ...
summary(gapminder)
country continent year lifeExp pop gdpPercap
Afghanistan: 12 Africa :624 Min. :1952 Min. :23.60 Min. :6.001e+04 Min. : 241.2
Albania : 12 Americas:300 1st Qu.:1966 1st Qu.:48.20 1st Qu.:2.794e+06 1st Qu.: 1202.1
Algeria : 12 Asia :396 Median :1980 Median :60.71 Median :7.024e+06 Median : 3531.8
Angola : 12 Europe :360 Mean :1980 Mean :59.47 Mean :2.960e+07 Mean : 7215.3
Argentina : 12 Oceania : 24 3rd Qu.:1993 3rd Qu.:70.85 3rd Qu.:1.959e+07 3rd Qu.: 9325.5
Australia : 12 Max. :2007 Max. :82.60 Max. :1.319e+09 Max. :113523.1
(Other) :1632
GDP per capita does not appear to be normally distributed.
hist(gapminder$lifeExp)
hist(gapminder$gdpPercap)
The relationship between GDP per capita and life expectancy doe not appear to be linear.
plot(gapminder$gdpPercap, gapminder$lifeExp, col="red")
abline(lm(gapminder$lifeExp ~ gapminder$gdpPercap), col="blue")
mgdp=mean(gapminder$gdpPercap)
gapminder$gdp_c=gapminder$gdpPercap-mgdp
hist(gapminder$gdp_c)
GDP per capita is significantly associated with life expectancy. Curve does not fit well. Only 34 % of the variance is explianed by the model
reg1 <-lm(gapminder$lifeExp ~ gapminder$gdp_c)
summary(reg1)
Call:
lm(formula = gapminder$lifeExp ~ gapminder$gdp_c)
Residuals:
Min 1Q Median 3Q Max
-82.754 -7.758 2.176 8.225 18.426
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 5.947e+01 2.542e-01 234.01 <2e-16 ***
gapminder$gdp_c 7.649e-04 2.579e-05 29.66 <2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 10.49 on 1702 degrees of freedom
Multiple R-squared: 0.3407, Adjusted R-squared: 0.3403
F-statistic: 879.6 on 1 and 1702 DF, p-value: < 2.2e-16
confint(reg1)
2.5 % 97.5 %
(Intercept) 5.897595e+01 5.997292e+01
gapminder$gdp_c 7.142984e-04 8.154669e-04
gapminder$gdpvalues <- seq(4000, 114000, length.out = 1704)
gapminder$predictedvalues1 <- predict(reg1, list(gdp_v=gapminder$gdpvalues, gdp_v2=gapminder$gdpvalues*gapminder$gdpvalues))
plot(gapminder$gdpPercap, gapminder$predictedvalues1, pch=16, col = "red" )
GDP per capita is significantly associated with life expectancy. Curve fits relatively well. Only 53 % of the variance is explianed by the model.
summary(reg2)
Call:
lm(formula = gapminder$lifeExp ~ gapminder$gdp_c + I(gapminder$gdp_c^2))
Residuals:
Min 1Q Median 3Q Max
-28.0600 -6.4253 0.2611 7.0889 27.1752
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 6.093e+01 2.225e-01 273.88 <2e-16 ***
gapminder$gdp_c 1.334e-03 3.098e-05 43.07 <2e-16 ***
I(gapminder$gdp_c^2) -1.502e-08 5.794e-10 -25.92 <2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 8.885 on 1701 degrees of freedom
Multiple R-squared: 0.5274, Adjusted R-squared: 0.5268
F-statistic: 949.1 on 2 and 1701 DF, p-value: < 2.2e-16
plot(gapminder$gdpPercap, gapminder$lifeExp, type="p", lwd=3)
poly2 <- function(x) reg2$coefficient[3]*x^2 + reg2$coefficient[2]*x + reg2$coefficient[1]
curve(poly2, col="red", lwd=2)
points(gapminder$gdpPercap, gapminder$lifeExp, type="p", lwd=3)
gapminder$gdpvalues <- seq(4000, 114000, length.out = 1704)
gapminder$predictedvalues2 <- predict(reg2, list(gdp_v=gapminder$gdpvalues, gdp_v2=gapminder$gdpvalues*gapminder$gdpvalues))
plot(gapminder$gdpPercap, gapminder$predictedvalues2, pch=16, col = "red" )
GDP per capita is significantly associated with life expectancy. Curve does not fit well. Only 60 % of the variance is explianed by the model.
summary(reg3)
Call:
lm(formula = gapminder$lifeExp ~ gapminder$gdp_c + I(gapminder$gdp_c^2) +
I(gapminder$gdp_c^3))
Residuals:
Min 1Q Median 3Q Max
-27.0491 -4.9994 0.1297 5.8643 26.2220
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 6.373e+01 2.577e-01 247.30 <2e-16 ***
gapminder$gdp_c 1.803e-03 3.878e-05 46.49 <2e-16 ***
I(gapminder$gdp_c^2) -5.929e-08 2.545e-09 -23.30 <2e-16 ***
I(gapminder$gdp_c^3) 4.093e-13 2.301e-14 17.79 <2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 8.161 on 1700 degrees of freedom
Multiple R-squared: 0.6016, Adjusted R-squared: 0.6009
F-statistic: 855.6 on 3 and 1700 DF, p-value: < 2.2e-16
confint(reg3)
2.5 % 97.5 %
(Intercept) 6.322037e+01 6.423121e+01
gapminder$gdp_c 1.726987e-03 1.879115e-03
I(gapminder$gdp_c^2) -6.428661e-08 -5.430236e-08
I(gapminder$gdp_c^3) 3.641838e-13 4.544460e-13
plot(gapminder$gdpPercap, gapminder$lifeExp, type="p", lwd=3)
poly3 <- function(x) reg3$coefficient[3]*x^2 + reg3$coefficient[2]*x + reg3$coefficient[1]
curve(poly3, col="red", lwd=2)
points(gapminder$gdpPercap, gapminder$lifeExp, type="p", lwd=3)
gapminder$gdpvalues <- seq(4000, 114000, length.out = 1704)
gapminder$predictedvalues3 <- predict(reg3, list(gdp_v=gapminder$gdpvalues, gdp_v2=gapminder$gdpvalues*gapminder$gdpvalues))
plot(gapminder$gdpPercap, gapminder$predictedvalues3, pch=16, col = "red" )
anova(reg1, reg2)
Analysis of Variance Table
Model 1: gapminder$lifeExp ~ gapminder$gdp_c
Model 2: gapminder$lifeExp ~ gapminder$gdp_c + I(gapminder$gdp_c^2)
Res.Df RSS Df Sum of Sq F Pr(>F)
1 1702 187335
2 1701 134289 1 53046 671.92 < 2.2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
anova(reg2, reg3)
Analysis of Variance Table
Model 1: gapminder$lifeExp ~ gapminder$gdp_c + I(gapminder$gdp_c^2)
Model 2: gapminder$lifeExp ~ gapminder$gdp_c + I(gapminder$gdp_c^2) +
I(gapminder$gdp_c^3)
Res.Df RSS Df Sum of Sq F Pr(>F)
1 1701 134289
2 1700 113216 1 21073 316.43 < 2.2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
influence.measures(reg3)
Influence measures of
lm(formula = gapminder$lifeExp ~ gapminder$gdp_c + I(gapminder$gdp_c^2) + I(gapminder$gdp_c^3)) :
%let path=/folders/myshortcuts/datasets;
libname usr "&path";
proc import datafile="&path/gapminder_r.csv" dbms=csv out=gapminder_r replace;
getnames=yes;
run;
proc sgplot data=gapminder_r;
reg x=gdppercap y=lifeexp / lineattrs=(color=blue thickness=2) clm;
yaxis label="Life expectancy";
xaxis label="GDP per capita";
run;
scatterplot/linear
proc sgplot data=gapminder_r;
reg x=gdppercap y=lifeexp / lineattrs=(color=green thickness=2) degree=2 clm;
yaxis label="Life expectancy";
xaxis label="GDP per capita";
run;
Scatterplot/quadratic
proc sgplot data=gapminder_r;
reg x=gdppercap y=lifeexp / lineattrs=(color=green thickness=2) degree=3clm;
yaxis label="Life expectancy";
xaxis label="GDP per capita";
run;
scatterplot/cubic
proc sgplot data=gapminder_r;
reg x=gdppercap y=lifeexp / lineattrs=(color=blue thickness=2) degree=1 clm;
reg x=gdppercap y=lifeexp / lineattrs=(color=green thickness=2) degree=2 clm;
reg x=gdppercap y=lifeexp / lineattrs=(color=green thickness=2) degree=3clm;
yaxis label="Life expectancy";
xaxis label="GDP per capita";
run;
scatterplot/degrees1,2&3
proc means data=gapminder_r;
var lifeexp gdppercap;
run;
data gapminder_c;
set gapminder_r;
if lifeexp ne . and gdppercap ne . ;
gdppercap_c=gdppercap-7215.33;
lifeexp_c=lifeexp-59.4744;
run;
proc means data=gapminder_c;
var lifeexp_c gdppercap_c;
run;
centering_variables
PROC glm data=gapminder_c;
model lifeexp=gdppercap_c/solution clparm;
run;
linear regression
PROC glm data=gapminder_c;
model lifeexp=gdppercap_c gdppercap_c*gdppercap_c/solution clparm;
run;
polynomial/quadratic regression
PROC glm data=gapminder_c;
model lifeexp=gdppercap_c gdppercap_c*gdppercap_c gdppercap_c*gdppercap_c*gdppercap_c/solution clparm;
run;
polynomial/cubic regression
data gapminder_c;
set gapminder_c;
gdppercap_c2 = gdppercap_c*gdppercap_c;
gdppercap_c3 = gdppercap_c*gdppercap_c*gdppercap_c;
run;
proc reg data=gapminder_c alpha=0.05
plots(only)=(diagnostics residuals fitplot observedbypredicted RStudentByLeverage(label) CooksD(label)
Residuals(smooth) DFFITS(label) DFBETAS ObservedByPredicted(label));
model lifeexp=gdppercap_c;
run;
quit;
proc reg data=gapminder_c alpha=0.05
plots(only)=(diagnostics residuals fitplot observedbypredicted RStudentByLeverage(label) CooksD(label)
Residuals(smooth) DFFITS(label) DFBETAS ObservedByPredicted(label));
model lifeexp=gdppercap_c gdppercap_c2;
run;
quit;
proc reg data=gapminder_c alpha=0.05
plots(only)=(diagnostics residuals fitplot observedbypredicted RStudentByLeverage(label) CooksD(label)
Residuals(smooth) DFFITS(label) DFBETAS ObservedByPredicted(label));
model lifeexp=gdppercap_c gdppercap_c2 gdppercap_c3 /;
run;
quit;
proc_reg/cubic
image
image
image
image
image
image
image