Use R to Analyze Data ‘savings’

Savings rates in 50 countries

The savings data frame has 50 rows and 5 columns; the data are averaged over the period 1960-1970. The data frame contains the following columns:

  sr: aggregate personal savings rate (personal saving divided by disposable income)
  pop15: percentage of population under age 15
  pop75: percentage of population over age 75
  dpi: real per-capita disposable income (U.S. dollars)
  ddpi: percentage growth rate of dpi

library(faraway)
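
A quick first look at the data frame itself (output omitted here; head and str are standard inspection commands):

head(savings)   # first few rows
str(savings)    # dimensions and variable types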

fullmodel=lm(sr~pop15+pop75+dpi+ddpi, data=savings) 
summary(fullmodel)
## 
## Call:
## lm(formula = sr ~ pop15 + pop75 + dpi + ddpi, data = savings)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -8.242 -2.686 -0.249  2.428  9.751 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 28.566087   7.354516    3.88  0.00033 ***
## pop15       -0.461193   0.144642   -3.19  0.00260 ** 
## pop75       -1.691498   1.083599   -1.56  0.12553    
## dpi         -0.000337   0.000931   -0.36  0.71917    
## ddpi         0.409695   0.196197    2.09  0.04247 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.8 on 45 degrees of freedom
## Multiple R-squared:  0.338,  Adjusted R-squared:  0.28 
## F-statistic: 5.76 on 4 and 45 DF,  p-value: 0.00079

Compare two nested models via F-test

reducedmodel=lm(sr~dpi+ddpi, data=savings)
#summary(reducedmodel)
anova(reducedmodel, fullmodel)
## Analysis of Variance Table
## 
## Model 1: sr ~ dpi + ddpi
## Model 2: sr ~ pop15 + pop75 + dpi + ddpi
##   Res.Df RSS Df Sum of Sq    F Pr(>F)   
## 1     47 825                            
## 2     45 651  2       174 6.02 0.0048 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Let’s check how the F-stat is computed.
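
For nested models, the F-statistic is

F = [(RSS_reduced - RSS_full) / (df_reduced - df_full)] / [RSS_full / df_full],

which, under H0 and normal errors, follows an F distribution with (df_reduced - df_full, df_full) degrees of freedom. Here df_reduced = 47 and df_full = 45, so the numerator df is 2.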

rss.full=sum(fullmodel$res^2)        # "$res" partially matches "$residuals"
# rss.full=deviance(fullmodel)       # "deviance" also extracts the RSS
rss.reduced=sum(reducedmodel$res^2)
# rss.reduced=deviance(reducedmodel)
# numerator df = 47 - 45 = 2; denominator df = 45 (residual df of the full model)
Fstat=(rss.reduced-rss.full)/2/(rss.full/45)
1-pf(Fstat, 2, 45)
## [1] 0.004835
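
The same computation without hard-coded degrees of freedom, using deviance() and df.residual() (a sketch; it reproduces the anova() results above):

df.full    = df.residual(fullmodel)      # 45
df.reduced = df.residual(reducedmodel)   # 47
Fstat = ((deviance(reducedmodel)-deviance(fullmodel))/(df.reduced-df.full)) /
        (deviance(fullmodel)/df.full)
1 - pf(Fstat, df.reduced-df.full, df.full)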

H0: sr ~ pop.mid + dpi + ddpi (model I), where pop.mid = 1 - pop15 - pop75

Ha: sr ~ pop15 + pop75 + dpi + ddpi (model II)

Although model I appears to contain a new variable, pop.mid, model I is contained in model II: any linear combination of the intercept and the three variables in model I can be rewritten as a linear combination of the intercept and the four variables in model II. On the other hand, not every linear combination of the variables in model II can be written as a linear combination of the variables in model I. That is, model II is larger than model I and contains model I as a special case.
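
Concretely, if model I is sr = b0 + b1*(1 - pop15 - pop75) + b2*dpi + b3*ddpi, expanding gives sr = (b0 + b1) - b1*pop15 - b1*pop75 + b2*dpi + b3*ddpi. So model I is exactly model II with the constraint that the coefficients of pop15 and pop75 are equal; this single linear constraint is why the F-test below has 1 numerator degree of freedom.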

reducemodel=lm(sr ~ I(1-pop15-pop75) + dpi + ddpi, data=savings)
anova(reducemodel, fullmodel)
## Analysis of Variance Table
## 
## Model 1: sr ~ I(1 - pop15 - pop75) + dpi + ddpi
## Model 2: sr ~ pop15 + pop75 + dpi + ddpi
##   Res.Df RSS Df Sum of Sq    F Pr(>F)
## 1     46 674                         
## 2     45 651  1      22.9 1.58   0.21

Dropping a single predictor gives a partial F-test with 1 numerator degree of freedom, which is equivalent to the t-test on that coefficient: the F-stat equals the squared t-stat.

tmpmodel=lm(sr~pop75+dpi+ddpi,data=savings)
anova(tmpmodel, fullmodel)
## Analysis of Variance Table
## 
## Model 1: sr ~ pop75 + dpi + ddpi
## Model 2: sr ~ pop15 + pop75 + dpi + ddpi
##   Res.Df RSS Df Sum of Sq    F Pr(>F)   
## 1     46 798                            
## 2     45 651  1       147 10.2 0.0026 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(fullmodel)$coef
##               Estimate Std. Error t value  Pr(>|t|)
## (Intercept) 28.5660865  7.3545161  3.8842 0.0003338
## pop15       -0.4611931  0.1446422 -3.1885 0.0026030
## pop75       -1.6914977  1.0835989 -1.5610 0.1255298
## dpi         -0.0003369  0.0009311 -0.3618 0.7191732
## ddpi         0.4096949  0.1961971  2.0882 0.0424711
summary(fullmodel)$coef[2,3]^2  # t-stat^2 = F-stat
## [1] 10.17

LS Coefficient.

The LS coefficient beta_j describes the association between Y and X_j after adjusting for all the other predictors; it is closely related to the partial correlation between Y and X_j.

beta_j from MLR can be estimated via the following steps:

  1. regress Y on all the predictors except X_j and save the residuals;

  2. regress X_j on all the other predictors and save the residuals;

  3. regress the residuals from step 1 on the residuals from step 2; the slope is exactly beta_j (this is the Frisch-Waugh-Lovell theorem).

new.y=lm(sr~pop15+pop75+dpi, data=savings)$res        # step 1: residuals of sr
new.ddpi = lm(ddpi~pop15+pop75+dpi, data=savings)$res # step 2: residuals of ddpi
parmodel=lm(new.y ~ new.ddpi)                         # step 3
parmodel$coef   # intercept is numerically 0; slope matches the ddpi coefficient
## (Intercept)    new.ddpi 
##  -4.389e-17   4.097e-01
fullmodel$coef
## (Intercept)       pop15       pop75         dpi        ddpi 
##  28.5660865  -0.4611931  -1.6914977  -0.0003369   0.4096949
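
As a further sanity check (another consequence of this residual-on-residual construction), the residuals from step 3 coincide with the full-model residuals; the following should return TRUE up to numerical error:

all.equal(unname(residuals(parmodel)), unname(residuals(fullmodel)))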

Permutation Test.

When the normal assumption is questionable, we can use a permutation test.

Suppose our null hypothesis is H0: sr ~ dpi + ddpi and the alternative is Ha: sr ~ pop15 + pop75 + dpi + ddpi.

Let's use the F-stat from the full model as the test statistic: if pop15 and pop75 are indeed relevant to sr, we would expect the F-stat to be large. So a large F-stat (from the full model) supports Ha (i.e., we know the rejection region).

Now the goal is to find the distribution of the F-stat under the null hypothesis, without using any normal assumption. How can we get a data set that also involves 50 countries with all four variables, and that looks like a data set generated under H0?

Recall that under H0, pop15 and pop75 are irrelevant. In our data, we have 50 pairs of (pop15, pop75), each corresponding to a country. What if we randomly reassign the 50 (pop15, pop75) pairs among the countries and then re-fit the full model? The inference based on this new data set shouldn't be affected much if (pop15, pop75) is truly irrelevant to sr. This is the idea behind the permutation test.

n.iter = 2000;              # number of random permutations
fstats = numeric(n.iter);   # storage for the permuted F-stats
for(i in 1:n.iter){
    newsavings=savings;
    # columns 2 and 3 are pop15 and pop75: permute the 50 pairs across countries
    newsavings[,c(2,3)]=savings[sample(50),c(2,3)];
    ge = lm(sr ~., data=newsavings);
    fstats[i] = summary(ge)$fstat[1]   # "$fstat" partially matches "$fstatistic"
}
# empirical p-value: fraction of permuted F-stats exceeding the observed one
length(fstats[fstats > summary(fullmodel)$fstat[1]])/n.iter
## [1] 0.0065

The permutation above is generated by sample(): called on 1:5, it returns a random reordering.

sample(1:5)
## [1] 4 5 1 3 2
sample(1:5)
## [1] 5 2 4 3 1

Confidence/Prediction Interval, Confidence Region

Use the command “confint” to obtain confidence intervals for regression coefficients.

confint(fullmodel)
##                 2.5 %    97.5 %
## (Intercept) 13.753331 43.378842
## pop15       -0.752518 -0.169869
## pop75       -3.873978  0.490983
## dpi         -0.002212  0.001538
## ddpi         0.014534  0.804856
confint(fullmodel, 'pop15', level=0.99)
##         0.5 %   99.5 %
## pop15 -0.8502 -0.07217
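
These intervals are just the estimate plus/minus a t quantile times the standard error; a quick check for pop15 that should reproduce the 95% interval above:

est = summary(fullmodel)$coef["pop15", "Estimate"]
se  = summary(fullmodel)$coef["pop15", "Std. Error"]
est + c(-1, 1) * qt(0.975, df.residual(fullmodel)) * se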

Use the command "ellipse" from the ellipse package to obtain confidence regions. Here we plot the confidence region for (pop15, pop75), which is an ellipse centered at the LS estimates of their coefficients.

Recall that we tested whether these two coefficients are both zero using the F-test earlier (p-value 0.0048). By the duality between confidence regions and hypothesis tests, we reject the hypothesis at level alpha exactly when the point (0,0) falls outside the 100(1-alpha)% region. Since 0.002 < 0.0048 < 0.01, the origin should lie outside the 95% and 99% regions but inside the 99.8% region, which is why these three levels are plotted below.

library(ellipse)
library(ggplot2)

CR95 = ellipse(fullmodel, c(2,3))
CR99 = ellipse(fullmodel, c(2,3), level=0.99)
CR998 = ellipse(fullmodel, c(2,3), level=0.998)
dim(CR95)
## [1] 100   2
head(CR95)
##        pop15  pop75
## [1,] -0.1172 0.8857
## [2,] -0.1258 0.9401
## [3,] -0.1358 0.9839
## [4,] -0.1471 1.0170
## [5,] -0.1597 1.0391
## [6,] -0.1735 1.0502
myCR = rbind(CR95, CR99, CR998);
myCR = data.frame(myCR); 
names(myCR) = c("pop15","pop75"); 
myCR[, 'level']=as.factor(c(rep(0.95, dim(CR95)[1]), 
                              rep(0.99, dim(CR99)[1]), 
                              rep(0.998, dim(CR998)[1])));


ggplot(data=myCR, aes(x=pop15, y=pop75, colour=level)) + 
  geom_path(aes(linetype=level), size=1.5) + 
  geom_point(x=coef(fullmodel)[2], y=coef(fullmodel)[3], shape=3, size=3, colour='red') + 
  geom_point(x=0, y=0, shape=1, size=3, colour='red') 

[Figure: confidence regions for (pop15, pop75) at levels 0.95, 0.99, and 0.998; the cross marks the LS estimate and the open circle marks (0,0).]

Let's find the 95 percent confidence interval (CI) and prediction interval (PI) at the mean value of the predictors. The CI is for the mean response at those predictor values; the PI is for a new individual observation and is therefore wider.

# create a data frame on which you’d like to predict
meanvalue=apply(savings[,2:5],2,mean)
meanvalue
##    pop15    pop75      dpi     ddpi 
##   35.090    2.293 1106.758    3.758
x=data.frame(t(meanvalue))
predict.lm(fullmodel,x,interval="confidence")
##     fit   lwr   upr
## 1 9.671 8.588 10.75
predict.lm(fullmodel,x,interval="prediction")
##     fit   lwr   upr
## 1 9.671 1.936 17.41
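
As a sanity check, with an intercept in the model the fitted value at the predictor means equals the sample mean of the response, so the following should match the "fit" value above (about 9.671):

mean(savings$sr)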

CIs and PIs get wider as we move away from the center of the training data. Next we display the 90% CIs/PIs for the full model as we vary "ddpi", holding the other predictors fixed at the meanvalue.

# First create the data frame on which you need to predict
# range(savings$ddpi)
#[1]  0.22 16.71
x=matrix(meanvalue[1:3], 20, 3, byrow=TRUE)
colnames(x)=c("pop15", "pop75", "dpi")
x=data.frame(x)
# head(x)
x[, 'ddpi']=0:19; 
# head(x)

myCI=predict.lm(fullmodel,x,interval="confidence", level=0.90);
myPI=predict.lm(fullmodel,x,interval="prediction", level=0.90);


# Plot the LS fit with 90% confidence and prediction bands
ggplot(data=NULL, aes(x=0:19)) + 
  geom_line(aes(y=myCI[,1], colour="LSfit"), size=1) + 
  geom_line(aes(y=myCI[,2], colour="90% CI"), size=1) +
  geom_line(aes(y=myCI[,3], colour="90% CI"), size=1) +
  geom_line(aes(y=myPI[,2], colour="90% PI"), size=1, linetype=2) +
  geom_line(aes(y=myPI[,3], colour="90% PI"), size=1, linetype=2) + 
  scale_colour_manual("", values=c("LSfit" = "black",
                               "90% CI" = "blue", 
                               "90% PI"="red")) + 
  xlab("ddpi") +ylab("saving rate") + 
  geom_vline(xintercept = mean(savings$ddpi), colour="purple", size=1, linetype=3)

[Figure: LS fit (black) with 90% confidence bands (blue) and 90% prediction bands (red, dashed) as ddpi varies from 0 to 19; a vertical dotted line marks the mean of ddpi.]