NORTHLANDS - REGRESSING ON SMOOTHED POPULATION

EXPLORING THE DATA

Reading:

options(width = 320)
northlands = read.csv("Canada.csv")[, 4:8]
head(northlands)
##   SmoothPop Elevation River  Port Temperature
## 1    -1.178   0.59604 2.506 2.155    -2.03699
## 2    -1.097  -0.40021 2.545 2.173    -2.06540
## 3    -1.076  -0.75025 2.572 2.173    -2.03699
## 4    -1.112   1.10713 2.427 2.151     0.02058
## 5    -1.077   0.01878 2.433 2.151     0.02058
## 6    -1.059   1.10713 2.433 2.143    -1.91359

Summaries

summary(northlands)
##    SmoothPop        Elevation          River              Port          Temperature    
##  Min.   :-1.533   Min.   :-7.475   Min.   :-2.0773   Min.   :-2.6163   Min.   :-3.188  
##  1st Qu.:-0.832   1st Qu.:-0.597   1st Qu.:-0.6213   1st Qu.:-0.7327   1st Qu.:-0.748  
##  Median :-0.426   Median :-0.001   Median :-0.0088   Median : 0.0701   Median : 0.207  
##  Mean   : 0.000   Mean   : 0.000   Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.000  
##  3rd Qu.: 0.717   3rd Qu.: 0.641   3rd Qu.: 0.5452   3rd Qu.: 0.7386   3rd Qu.: 0.820  
##  Max.   : 3.667   Max.   : 3.355   Max.   : 2.6272   Max.   : 2.2921   Max.   : 1.979

Univariate visualization

par(mfrow = c(5, 2))
for (i in 1:5) {
    var = names(northlands)[i]
    plot(density(northlands[, i]), main = paste("Density of ", var))
    plot(sort(northlands[, i]), main = paste("IndexPlot of ", var), pch = ".")
}

plot of chunk unnamed-chunk-3

Bivariate visualization

pairs(~., data = northlands, main = "Simple Scatterplot Matrix")

plot of chunk unnamed-chunk-4

FINDING RELATIONSHIP WITH PREDICTORS

Finding Predictors

northlandsSmoothPFit <- lm(SmoothPop ~ ., data = northlands)
cbind(summary(northlandsSmoothPFit)$coeff, confint(northlandsSmoothPFit))
##               Estimate Std. Error    t value  Pr(>|t|)    2.5 %   97.5 %
## (Intercept)  4.448e-13   0.002041  2.179e-10 1.000e+00 -0.00400  0.00400
## Elevation   -2.780e-02   0.002134 -1.303e+01 9.002e-39 -0.03198 -0.02362
## River        1.323e-01   0.002359  5.609e+01 0.000e+00  0.12769  0.13693
## Port         1.367e-01   0.003460  3.950e+01 0.000e+00  0.12989  0.14345
## Temperature  9.371e-01   0.003783  2.477e+02 0.000e+00  0.92971  0.94454

Evaluating Design

For each variable

nlwithNoTemp <- lm(SmoothPop ~ Elevation + River + Port, data = northlands)
nlwithNoPort <- lm(SmoothPop ~ Elevation + River + Temperature, data = northlands)
nlwithNoRiver <- lm(SmoothPop ~ Elevation + Port + Temperature, data = northlands)
nlwithNoElevation <- lm(SmoothPop ~ River + Port + Temperature, data = northlands)

anova(northlandsSmoothPFit, nlwithNoTemp)
## Analysis of Variance Table
## 
## Model 1: SmoothPop ~ Elevation + River + Port + Temperature
## Model 2: SmoothPop ~ Elevation + River + Port
##   Res.Df   RSS Df Sum of Sq     F Pr(>F)    
## 1  99322 41098                              
## 2  99323 66487 -1    -25389 61360 <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova(northlandsSmoothPFit, nlwithNoPort)
## Analysis of Variance Table
## 
## Model 1: SmoothPop ~ Elevation + River + Port + Temperature
## Model 2: SmoothPop ~ Elevation + River + Temperature
##   Res.Df   RSS Df Sum of Sq    F Pr(>F)    
## 1  99322 41098                             
## 2  99323 41743 -1      -646 1560 <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova(northlandsSmoothPFit, nlwithNoRiver)
## Analysis of Variance Table
## 
## Model 1: SmoothPop ~ Elevation + River + Port + Temperature
## Model 2: SmoothPop ~ Elevation + Port + Temperature
##   Res.Df   RSS Df Sum of Sq    F Pr(>F)    
## 1  99322 41098                             
## 2  99323 42400 -1     -1302 3147 <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova(northlandsSmoothPFit, nlwithNoElevation)
## Analysis of Variance Table
## 
## Model 1: SmoothPop ~ Elevation + River + Port + Temperature
## Model 2: SmoothPop ~ River + Port + Temperature
##   Res.Df   RSS Df Sum of Sq   F Pr(>F)    
## 1  99322 41098                            
## 2  99323 41168 -1     -70.2 170 <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

For pair of variables

nlwithNoTempElev <- lm(SmoothPop ~ I(Elevation + Temperature) + River + Port, data = northlands)
nlwithNoRiverPort <- lm(SmoothPop ~ Elevation + Temperature + I(River + Port), data = northlands)
nlwithNoTempRiver <- lm(SmoothPop ~ Elevation + I(Temperature + River) + Port, data = northlands)
nlwithNoElevPort <- lm(SmoothPop ~ I(Elevation + Port) + Temperature + River, data = northlands)
nlwithNoElevRiver <- lm(SmoothPop ~ I(Elevation + River) + Temperature + Port, data = northlands)
nlwithNoTempPort <- lm(SmoothPop ~ Elevation + River + I(Temperature + Port), data = northlands)

anova(northlandsSmoothPFit, nlwithNoTempElev)
## Analysis of Variance Table
## 
## Model 1: SmoothPop ~ Elevation + River + Port + Temperature
## Model 2: SmoothPop ~ I(Elevation + Temperature) + River + Port
##   Res.Df   RSS Df Sum of Sq     F Pr(>F)    
## 1  99322 41098                              
## 2  99323 58296 -1    -17199 41565 <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova(northlandsSmoothPFit, nlwithNoRiverPort)
## Analysis of Variance Table
## 
## Model 1: SmoothPop ~ Elevation + River + Port + Temperature
## Model 2: SmoothPop ~ Elevation + Temperature + I(River + Port)
##   Res.Df   RSS Df Sum of Sq    F Pr(>F)
## 1  99322 41098                         
## 2  99323 41098 -1    -0.473 1.14   0.29
anova(northlandsSmoothPFit, nlwithNoTempRiver)
## Analysis of Variance Table
## 
## Model 1: SmoothPop ~ Elevation + River + Port + Temperature
## Model 2: SmoothPop ~ Elevation + I(Temperature + River) + Port
##   Res.Df   RSS Df Sum of Sq     F Pr(>F)    
## 1  99322 41098                              
## 2  99323 60882 -1    -19784 47814 <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova(northlandsSmoothPFit, nlwithNoElevPort)
## Analysis of Variance Table
## 
## Model 1: SmoothPop ~ Elevation + River + Port + Temperature
## Model 2: SmoothPop ~ I(Elevation + Port) + Temperature + River
##   Res.Df   RSS Df Sum of Sq    F Pr(>F)    
## 1  99322 41098                             
## 2  99323 41735 -1      -638 1541 <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova(northlandsSmoothPFit, nlwithNoElevRiver)
## Analysis of Variance Table
## 
## Model 1: SmoothPop ~ Elevation + River + Port + Temperature
## Model 2: SmoothPop ~ I(Elevation + River) + Temperature + Port
##   Res.Df   RSS Df Sum of Sq    F Pr(>F)    
## 1  99322 41098                             
## 2  99323 42121 -1     -1023 2473 <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova(northlandsSmoothPFit, nlwithNoTempPort)
## Analysis of Variance Table
## 
## Model 1: SmoothPop ~ Elevation + River + Port + Temperature
## Model 2: SmoothPop ~ Elevation + River + I(Temperature + Port)
##   Res.Df   RSS Df Sum of Sq      F Pr(>F)    
## 1  99322 41098                               
## 2  99323 83454 -1    -42357 102366 <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Verifying assumptions

Looking for outliers visually

par(mfrow = c(2, 2))
plot(northlandsSmoothPFit$res, ylab = "Residuals", main = "Index plot of residuals")
x <- model.matrix(northlandsSmoothPFit)
lev <- hat(x)
plot(lev, ylab = "Leverages", main = "Index plot of Leverages")
results <- summary(northlandsSmoothPFit)
stud <- northlandsSmoothPFit$res/(results$sig * sqrt(1 - lev))
plot(stud, ylab = "Studentized Residuals", main = "Studentized Residuals")
cook <- cooks.distance(northlandsSmoothPFit)
plot(cook, ylab = "Cooks distances")

plot of chunk unnamed-chunk-8

Influence of possible outlier in model

northlandsSmoothPFit2 <- lm(SmoothPop ~ ., data = northlands, subset = (cook < max(cook)))
summary(northlandsSmoothPFit2)
## 
## Call:
## lm(formula = SmoothPop ~ ., data = northlands, subset = (cook < 
##     max(cook)))
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -3.036 -0.450 -0.016  0.396  2.708 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -2.61e-05   2.04e-03   -0.01     0.99    
## Elevation   -2.77e-02   2.13e-03  -13.00   <2e-16 ***
## River        1.32e-01   2.36e-03   56.12   <2e-16 ***
## Port         1.37e-01   3.46e-03   39.53   <2e-16 ***
## Temperature  9.37e-01   3.78e-03  247.74   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.643 on 99321 degrees of freedom
## Multiple R-squared:  0.586,  Adjusted R-squared:  0.586 
## F-statistic: 3.52e+04 on 4 and 99321 DF,  p-value: <2e-16
summary(northlandsSmoothPFit)
## 
## Call:
## lm(formula = SmoothPop ~ ., data = northlands)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -3.037 -0.450 -0.016  0.396  2.707 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  4.45e-13   2.04e-03     0.0        1    
## Elevation   -2.78e-02   2.13e-03   -13.0   <2e-16 ***
## River        1.32e-01   2.36e-03    56.1   <2e-16 ***
## Port         1.37e-01   3.46e-03    39.5   <2e-16 ***
## Temperature  9.37e-01   3.78e-03   247.7   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.643 on 99322 degrees of freedom
## Multiple R-squared:  0.586,  Adjusted R-squared:  0.586 
## F-statistic: 3.52e+04 on 4 and 99322 DF,  p-value: <2e-16

Heteroskedacity?

plot(northlandsSmoothPFit$fit, northlandsSmoothPFit$res, xlab = "Fitted", ylab = "Residuals")
abline(h = 0)

plot of chunk unnamed-chunk-10

Using test:

library(lmtest)
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## 
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
bptest(northlandsSmoothPFit)
## 
##  studentized Breusch-Pagan test
## 
## data:  northlandsSmoothPFit
## BP = 11378, df = 4, p-value < 2.2e-16
library(sandwich)
northlandsSmoothPFit$newse <- vcovHC(northlandsSmoothPFit)
coeftest(northlandsSmoothPFit, northlandsSmoothPFit$newse)
## 
## t test of coefficients:
## 
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  4.45e-13   2.04e-03     0.0        1    
## Elevation   -2.78e-02   2.47e-03   -11.3   <2e-16 ***
## River        1.32e-01   2.29e-03    57.8   <2e-16 ***
## Port         1.37e-01   4.10e-03    33.3   <2e-16 ***
## Temperature  9.37e-01   4.04e-03   231.9   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Normality of Residuals

par(mfrow = c(1, 2))
qqnorm(northlandsSmoothPFit$res, ylab = "Raw Residuals")
qqline(northlandsSmoothPFit$res)
qqnorm(rstudent(northlandsSmoothPFit), ylab = "Studentized residuals")
abline(0, 1)

plot of chunk unnamed-chunk-12

Multicollinearity

library(car)
coltest = vif(northlandsSmoothPFit)  # variance inflation factors 
sqrt(coltest) > 2  # problem?
##   Elevation       River        Port Temperature 
##       FALSE       FALSE       FALSE       FALSE