NORTHLANDS - REGRESSING ON SMOOTHED POPULATION

EXPLORING THE DATA

Reading:

options(width = 320)
northlands = read.csv("Canada_raw.csv")
head(northlands)
##   Row Column SmoothPop Elevation River Port Temperature
## 1 546    475     1.514       566 607.6 1461       258.0
## 2 547    473     2.103       233 641.5 1471       257.8
## 3 547    475     2.275       166 666.0 1472       258.0
## 4 548    466     1.981       871 543.8 1458       273.1
## 5 548    468     2.265       342 548.2 1458       273.1
## 6 548    469     2.422       871 548.2 1453       258.9

EDA function

EDA <- function(data) {
    print(summary(data))
    par(mfrow = c(5, 2))
    for (i in 1:ncol(data)) {
        var = names(data)[i]
        plot(density(data[, i]), main = paste("Density of ", var))
        plot(sort(data[, i]), main = paste("IndexPlot of ", var), pch = ".")
    }
    pairs(~., data, main = "Simple Scatterplot Matrix")
}
EDA(northlands[, 3:7])
##    SmoothPop        Elevation        River            Port       Temperature 
##  Min.   :     0   Min.   : -42   Min.   :  0.0   Min.   :   0   Min.   :249  
##  1st Qu.:     5   1st Qu.: 193   1st Qu.:  6.7   1st Qu.: 245   1st Qu.:267  
##  Median :    17   Median : 336   Median : 17.1   Median : 481   Median :275  
##  Mean   :  2806   Mean   : 467   Mean   : 49.1   Mean   : 518   Mean   :273  
##  3rd Qu.:   342   3rd Qu.: 588   3rd Qu.: 38.2   3rd Qu.: 738   3rd Qu.:279  
##  Max.   :721316   Max.   :5409   Max.   :719.7   Max.   :1544   Max.   :288

plot of chunk unnamed-chunk-4 plot of chunk unnamed-chunk-4

Z-SCORES NORMALIZATION


Znorthlands = as.data.frame(scale(northlands[, 3:7]))
EDA(Znorthlands)
##    SmoothPop       Elevation          River             Port         Temperature    
##  Min.   :-0.14   Min.   :-1.218   Min.   :-0.525   Min.   :-1.558   Min.   :-3.188  
##  1st Qu.:-0.14   1st Qu.:-0.656   1st Qu.:-0.454   1st Qu.:-0.821   1st Qu.:-0.748  
##  Median :-0.13   Median :-0.313   Median :-0.343   Median :-0.112   Median : 0.207  
##  Mean   : 0.00   Mean   : 0.000   Mean   : 0.000   Mean   : 0.000   Mean   : 0.000  
##  3rd Qu.:-0.12   3rd Qu.: 0.290   3rd Qu.:-0.117   3rd Qu.: 0.659   3rd Qu.: 0.820  
##  Max.   :34.72   Max.   :11.829   Max.   : 7.174   Max.   : 3.080   Max.   : 1.979

plot of chunk unnamed-chunk-5 plot of chunk unnamed-chunk-5

data = Znorthlands[, 2:5]
wss <- (nrow(data) - 1) * sum(apply(data, 2, var))
for (i in 2:15) wss[i] <- sum(kmeans(data, centers = i)$withinss)
## Warning: Quick-TRANSfer stage steps exceeded maximum (= 4966350)
plot(1:15, wss, type = "b", xlab = "Number of Clusters", ylab = "Within groups sum of squares")

plot of chunk unnamed-chunk-6

# K-Means Cluster Analysis
fit <- kmeans(data, 3)  # 3 cluster solution
# get cluster means
aggregate(data, by = list(fit$cluster), FUN = mean)
##   Group.1 Elevation   River    Port Temperature
## 1       1   -0.2356  2.9955  1.8947     -1.8533
## 2       2    0.3032 -0.3238 -0.6516      0.6806
## 3       3   -0.4657 -0.1527  0.6718     -0.7315
# append cluster assignment
Znorthlands <- data.frame(Znorthlands, fit$cluster)
library(plyr)
# Break up data by cluster, then fit the specified model to each piece and return a list
models <- dlply(Znorthlands, "fit.cluster", function(df) lm(SmoothPop ~ . - fit.cluster, data = df))

# Apply coef to each model and return a data frame
ldply(models, coef)
##   fit.cluster (Intercept)  Elevation      River       Port Temperature
## 1           1     -0.1353  2.345e-05  0.0000124 -7.609e-05  -2.579e-05
## 2           2     -0.4869 -4.575e-02 -0.2103963 -2.253e-01   5.619e-01
## 3           3     -0.1331 -8.299e-04 -0.0001357  1.166e-03   3.026e-03

# Print the summary of each model
l_ply(models, summary, .print = TRUE)
## 
## Call:
## lm(formula = SmoothPop ~ . - fit.cluster, data = df)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -2.31e-04 -4.13e-05 -7.10e-06  1.99e-05  3.07e-03 
## 
## Coefficients:
##              Estimate Std. Error   t value Pr(>|t|)    
## (Intercept) -1.35e-01   7.75e-06 -17470.20  < 2e-16 ***
## Elevation    2.35e-05   2.15e-06     10.90  < 2e-16 ***
## River        1.24e-05   1.74e-06      7.13  1.1e-12 ***
## Port        -7.61e-05   4.36e-06    -17.46  < 2e-16 ***
## Temperature -2.58e-05   5.19e-06     -4.97  7.0e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.000145 on 7953 degrees of freedom
## Multiple R-squared:  0.0523, Adjusted R-squared:  0.0519 
## F-statistic:  110 on 4 and 7953 DF,  p-value: <2e-16
## 
## 
## Call:
## lm(formula = SmoothPop ~ . - fit.cluster, data = df)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
##  -1.22  -0.35  -0.14   0.07  33.83 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -0.48689    0.01317  -36.97   <2e-16 ***
## Elevation   -0.04575    0.00475   -9.64   <2e-16 ***
## River       -0.21040    0.02552   -8.24   <2e-16 ***
## Port        -0.22532    0.00981  -22.98   <2e-16 ***
## Temperature  0.56190    0.01114   50.45   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.26 on 57768 degrees of freedom
## Multiple R-squared:  0.0718, Adjusted R-squared:  0.0718 
## F-statistic: 1.12e+03 on 4 and 57768 DF,  p-value: <2e-16
## 
## 
## Call:
## lm(formula = SmoothPop ~ . - fit.cluster, data = df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.00580 -0.00150 -0.00067  0.00024  0.14161 
## 
## Coefficients:
##              Estimate Std. Error  t value Pr(>|t|)    
## (Intercept) -1.33e-01   6.70e-05 -1986.92   <2e-16 ***
## Elevation   -8.30e-04   6.49e-05   -12.79   <2e-16 ***
## River       -1.36e-04   7.82e-05    -1.74    0.083 .  
## Port         1.17e-03   7.09e-05    16.44   <2e-16 ***
## Temperature  3.03e-03   7.23e-05    41.84   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.00549 on 33591 degrees of freedom
## Multiple R-squared:  0.0665, Adjusted R-squared:  0.0664 
## F-statistic:  598 on 4 and 33591 DF,  p-value: <2e-16
mydataplot = cbind(northlands[, 1:2], Znorthlands)
plot(mydataplot$Column, mydataplot$Row, col = mydataplot$fit.cluster, pch = 20, cex = 1, ylim = rev(range(mydataplot$Row)))
legend("bottomleft", legend = paste("Group", 1:3), pch = 1, col = 1:5)

plot of chunk unnamed-chunk-8