NORTHLANDS - REGRESSING ON SMOOTHED POPULATION

EXPLORING THE DATA

Reading:

options(width = 320)
northlands = read.csv("Canada_zscore.csv")[, 4:8]
head(northlands)
##   SmoothPop Elevation River  Port Temperature
## 1   -0.1355    0.2371 5.975 2.831    -2.03699
## 2   -0.1355   -0.5599 6.338 2.863    -2.06540
## 3   -0.1355   -0.7203 6.599 2.864    -2.03699
## 4   -0.1355    0.9672 5.292 2.823     0.02058
## 5   -0.1355   -0.2990 5.339 2.823     0.02058
## 6   -0.1355    0.9672 5.339 2.809    -1.91359

Summaries

summary(northlands)
##    SmoothPop       Elevation          River             Port         Temperature    
##  Min.   :-0.14   Min.   :-1.218   Min.   :-0.525   Min.   :-1.558   Min.   :-3.188  
##  1st Qu.:-0.14   1st Qu.:-0.656   1st Qu.:-0.454   1st Qu.:-0.821   1st Qu.:-0.748  
##  Median :-0.13   Median :-0.313   Median :-0.343   Median :-0.112   Median : 0.207  
##  Mean   : 0.00   Mean   : 0.000   Mean   : 0.000   Mean   : 0.000   Mean   : 0.000  
##  3rd Qu.:-0.12   3rd Qu.: 0.290   3rd Qu.:-0.117   3rd Qu.: 0.659   3rd Qu.: 0.820  
##  Max.   :34.72   Max.   :11.829   Max.   : 7.174   Max.   : 3.080   Max.   : 1.979

Univariate visualization

par(mfrow = c(5, 2))
for (i in 1:5) {
    var = names(northlands)[i]
    plot(density(northlands$SmoothPop), main = paste("Density of ", var))
    plot(sort(northlands$SmoothPop), main = paste("IndexPlot of ", var), pch = ".")
}

plot of chunk unnamed-chunk-3

Bivariate visualization

pairs(~., data = northlands, main = "Simple Scatterplot Matrix")

plot of chunk unnamed-chunk-4

FINDING RELATIONSHIP WITH PREDICTORS

Finding Predictors

northlandsSmoothPFit <- lm(SmoothPop ~ ., data = northlands)
cbind(summary(northlandsSmoothPFit)$coeff, confint(northlandsSmoothPFit))
##               Estimate Std. Error    t value   Pr(>|t|)     2.5 %    97.5 %
## (Intercept)  9.782e-14   0.003099  3.157e-11  1.000e+00 -0.006074  0.006074
## Elevation   -8.010e-02   0.003188 -2.512e+01 7.487e-139 -0.086351 -0.073853
## River        1.132e-01   0.004010  2.824e+01 9.991e-175  0.105372  0.121091
## Port        -4.257e-02   0.005791 -7.351e+00  1.984e-13 -0.053922 -0.031220
## Temperature  2.335e-01   0.005821  4.012e+01  0.000e+00  0.222134  0.244952

Evaluating Design

For each variable

nlwithNoTemp <- lm(SmoothPop ~ Elevation + River + Port, data = northlands)
nlwithNoPort <- lm(SmoothPop ~ Elevation + River + Temperature, data = northlands)
nlwithNoRiver <- lm(SmoothPop ~ Elevation + Port + Temperature, data = northlands)
nlwithNoElevation <- lm(SmoothPop ~ River + Port + Temperature, data = northlands)

anova(northlandsSmoothPFit, nlwithNoTemp)
## Analysis of Variance Table
## 
## Model 1: SmoothPop ~ Elevation + River + Port + Temperature
## Model 2: SmoothPop ~ Elevation + River + Port
##   Res.Df   RSS Df Sum of Sq    F Pr(>F)    
## 1  99322 94731                             
## 2  99323 96266 -1     -1535 1610 <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova(northlandsSmoothPFit, nlwithNoPort)
## Analysis of Variance Table
## 
## Model 1: SmoothPop ~ Elevation + River + Port + Temperature
## Model 2: SmoothPop ~ Elevation + River + Temperature
##   Res.Df   RSS Df Sum of Sq  F Pr(>F)    
## 1  99322 94731                           
## 2  99323 94782 -1     -51.5 54  2e-13 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova(northlandsSmoothPFit, nlwithNoRiver)
## Analysis of Variance Table
## 
## Model 1: SmoothPop ~ Elevation + River + Port + Temperature
## Model 2: SmoothPop ~ Elevation + Port + Temperature
##   Res.Df   RSS Df Sum of Sq   F Pr(>F)    
## 1  99322 94731                            
## 2  99323 95491 -1      -760 797 <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova(northlandsSmoothPFit, nlwithNoElevation)
## Analysis of Variance Table
## 
## Model 1: SmoothPop ~ Elevation + River + Port + Temperature
## Model 2: SmoothPop ~ River + Port + Temperature
##   Res.Df   RSS Df Sum of Sq   F Pr(>F)    
## 1  99322 94731                            
## 2  99323 95333 -1      -602 631 <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

For pair of variables

nlwithNoTempElev <- lm(SmoothPop ~ I(Elevation + Temperature) + River + Port, data = northlands)
nlwithNoRiverPort <- lm(SmoothPop ~ Elevation + Temperature + I(River + Port), data = northlands)
nlwithNoTempRiver <- lm(SmoothPop ~ Elevation + I(Temperature + River) + Port, data = northlands)
nlwithNoElevPort <- lm(SmoothPop ~ I(Elevation + Port) + Temperature + River, data = northlands)
nlwithNoElevRiver <- lm(SmoothPop ~ I(Elevation + River) + Temperature + Port, data = northlands)
nlwithNoTempPort <- lm(SmoothPop ~ Elevation + River + I(Temperature + Port), data = northlands)

anova(northlandsSmoothPFit, nlwithNoTempElev)
## Analysis of Variance Table
## 
## Model 1: SmoothPop ~ Elevation + River + Port + Temperature
## Model 2: SmoothPop ~ I(Elevation + Temperature) + River + Port
##   Res.Df   RSS Df Sum of Sq    F Pr(>F)    
## 1  99322 94731                             
## 2  99323 96675 -1     -1944 2038 <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova(northlandsSmoothPFit, nlwithNoRiverPort)
## Analysis of Variance Table
## 
## Model 1: SmoothPop ~ Elevation + River + Port + Temperature
## Model 2: SmoothPop ~ Elevation + Temperature + I(River + Port)
##   Res.Df   RSS Df Sum of Sq   F Pr(>F)    
## 1  99322 94731                            
## 2  99323 95114 -1      -384 402 <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova(northlandsSmoothPFit, nlwithNoTempRiver)
## Analysis of Variance Table
## 
## Model 1: SmoothPop ~ Elevation + River + Port + Temperature
## Model 2: SmoothPop ~ Elevation + I(Temperature + River) + Port
##   Res.Df   RSS Df Sum of Sq   F Pr(>F)    
## 1  99322 94731                            
## 2  99323 95085 -1      -354 372 <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova(northlandsSmoothPFit, nlwithNoElevPort)
## Analysis of Variance Table
## 
## Model 1: SmoothPop ~ Elevation + River + Port + Temperature
## Model 2: SmoothPop ~ I(Elevation + Port) + Temperature + River
##   Res.Df   RSS Df Sum of Sq    F  Pr(>F)    
## 1  99322 94731                              
## 2  99323 94763 -1     -32.2 33.8 6.3e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova(northlandsSmoothPFit, nlwithNoElevRiver)
## Analysis of Variance Table
## 
## Model 1: SmoothPop ~ Elevation + River + Port + Temperature
## Model 2: SmoothPop ~ I(Elevation + River) + Temperature + Port
##   Res.Df   RSS Df Sum of Sq    F Pr(>F)    
## 1  99322 94731                             
## 2  99323 95966 -1     -1235 1295 <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova(northlandsSmoothPFit, nlwithNoTempPort)
## Analysis of Variance Table
## 
## Model 1: SmoothPop ~ Elevation + River + Port + Temperature
## Model 2: SmoothPop ~ Elevation + River + I(Temperature + Port)
##   Res.Df   RSS Df Sum of Sq    F Pr(>F)    
## 1  99322 94731                             
## 2  99323 98708 -1     -3977 4170 <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Verifying assumptions

Looking for outliers visually

par(mfrow = c(2, 2))
plot(northlandsSmoothPFit$res, ylab = "Residuals", main = "Index plot of residuals")
x <- model.matrix(northlandsSmoothPFit)
lev <- hat(x)
plot(lev, ylab = "Leverages", main = "Index plot of Leverages")
results <- summary(northlandsSmoothPFit)
stud <- northlandsSmoothPFit$res/(results$sig * sqrt(1 - lev))
plot(stud, ylab = "Studentized Residuals", main = "Studentized Residuals")
cook <- cooks.distance(northlandsSmoothPFit)
plot(cook, ylab = "Cooks distances")

plot of chunk unnamed-chunk-8

Influence of possible outlier in model

northlandsSmoothPFit2 <- lm(SmoothPop ~ ., data = northlands, subset = (cook < max(cook)))
summary(northlandsSmoothPFit2)
## 
## Call:
## lm(formula = SmoothPop ~ ., data = northlands, subset = (cook < 
##     max(cook)))
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
##  -0.70  -0.23  -0.10   0.07  34.27 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -0.00033    0.00308   -0.11     0.91    
## Elevation   -0.07966    0.00317  -25.13  < 2e-16 ***
## River        0.11279    0.00399   28.29  < 2e-16 ***
## Port        -0.04228    0.00576   -7.34  2.1e-13 ***
## Temperature  0.23287    0.00579   40.23  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.971 on 99321 degrees of freedom
## Multiple R-squared:  0.0465, Adjusted R-squared:  0.0464 
## F-statistic: 1.21e+03 on 4 and 99321 DF,  p-value: <2e-16
summary(northlandsSmoothPFit)
## 
## Call:
## lm(formula = SmoothPop ~ ., data = northlands)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
##  -0.71  -0.24  -0.10   0.07  34.27 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  9.78e-14   3.10e-03    0.00        1    
## Elevation   -8.01e-02   3.19e-03  -25.12   <2e-16 ***
## River        1.13e-01   4.01e-03   28.24   <2e-16 ***
## Port        -4.26e-02   5.79e-03   -7.35    2e-13 ***
## Temperature  2.34e-01   5.82e-03   40.12   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.977 on 99322 degrees of freedom
## Multiple R-squared:  0.0463, Adjusted R-squared:  0.0462 
## F-statistic: 1.2e+03 on 4 and 99322 DF,  p-value: <2e-16

Heteroskedacity?

plot(northlandsSmoothPFit$fit, northlandsSmoothPFit$res, xlab = "Fitted", ylab = "Residuals")
abline(h = 0)

plot of chunk unnamed-chunk-10

Using test:

library(lmtest)
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## 
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
bptest(northlandsSmoothPFit)
## 
##  studentized Breusch-Pagan test
## 
## data:  northlandsSmoothPFit
## BP = 836.8, df = 4, p-value < 2.2e-16
library(sandwich)
northlandsSmoothPFit$newse <- vcovHC(northlandsSmoothPFit)
coeftest(northlandsSmoothPFit, northlandsSmoothPFit$newse)
## 
## t test of coefficients:
## 
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  9.78e-14   3.10e-03     0.0        1    
## Elevation   -8.01e-02   3.63e-03   -22.1   <2e-16 ***
## River        1.13e-01   3.56e-03    31.8   <2e-16 ***
## Port        -4.26e-02   4.02e-03   -10.6   <2e-16 ***
## Temperature  2.34e-01   5.55e-03    42.1   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Normality of Residuals

par(mfrow = c(1, 2))
qqnorm(northlandsSmoothPFit$res, ylab = "Raw Residuals")
qqline(northlandsSmoothPFit$res)
qqnorm(rstudent(northlandsSmoothPFit), ylab = "Studentized residuals")
abline(0, 1)

plot of chunk unnamed-chunk-12

Multicollinearity

library(car)
coltest = vif(northlandsSmoothPFit)  # variance inflation factors 
sqrt(coltest) > 2  # problem?
##   Elevation       River        Port Temperature 
##       FALSE       FALSE       FALSE       FALSE