NORTHLANDS - REGRESSING ON SMOOTHED POPULATION

EXPLORING THE DATA

Reading:

options(width = 320)
northlands = read.csv("Canada_raw.csv")
head(northlands)
##   Row Column SmoothPop Elevation River Port Temperature
## 1 546    475     1.514       566 607.6 1461       258.0
## 2 547    473     2.103       233 641.5 1471       257.8
## 3 547    475     2.275       166 666.0 1472       258.0
## 4 548    466     1.981       871 543.8 1458       273.1
## 5 548    468     2.265       342 548.2 1458       273.1
## 6 548    469     2.422       871 548.2 1453       258.9

EDA function

EDA <- function(data) {
    print(summary(data))
    par(mfrow = c(5, 2))
    for (i in 1:ncol(data)) {
        var = names(data)[i]
        plot(density(data[, i]), main = paste("Density of ", var))
        plot(sort(data[, i]), main = paste("IndexPlot of ", var), pch = ".")
    }
    pairs(~., data, main = "Simple Scatterplot Matrix")
}
EDA(northlands[, 3:7])
##    SmoothPop        Elevation        River            Port       Temperature 
##  Min.   :     0   Min.   : -42   Min.   :  0.0   Min.   :   0   Min.   :249  
##  1st Qu.:     5   1st Qu.: 193   1st Qu.:  6.7   1st Qu.: 245   1st Qu.:267  
##  Median :    17   Median : 336   Median : 17.1   Median : 481   Median :275  
##  Mean   :  2806   Mean   : 467   Mean   : 49.1   Mean   : 518   Mean   :273  
##  3rd Qu.:   342   3rd Qu.: 588   3rd Qu.: 38.2   3rd Qu.: 738   3rd Qu.:279  
##  Max.   :721316   Max.   :5409   Max.   :719.7   Max.   :1544   Max.   :288

plot of chunk unnamed-chunk-3 plot of chunk unnamed-chunk-3

NORMALIZING

library(preprocessCore)
Nnorthlands = as.matrix(northlands[, 3:7])
Nnorthlands = normalize.quantiles(Nnorthlands)
Nnorthlands = as.data.frame(Nnorthlands)
names(Nnorthlands) = names(northlands[, 3:7])
EDA(Nnorthlands)
##    SmoothPop        Elevation          River             Port         Temperature    
##  Min.   :    50   Min.   :    41   Min.   :    59   Min.   :    46   Min.   :    50  
##  1st Qu.:   139   1st Qu.:   144   1st Qu.:   143   1st Qu.:   144   1st Qu.:   144  
##  Median :   225   Median :   225   Median :   225   Median :   225   Median :   225  
##  Mean   :   823   Mean   :   823   Mean   :   823   Mean   :   823   Mean   :   823  
##  3rd Qu.:   397   3rd Qu.:   397   3rd Qu.:   397   3rd Qu.:   397   3rd Qu.:   397  
##  Max.   :145855   Max.   :145855   Max.   :145855   Max.   :145855   Max.   :136482

plot of chunk unnamed-chunk-4 plot of chunk unnamed-chunk-4

data = Nnorthlands[, 2:5]
wss <- (nrow(data) - 1) * sum(apply(data, 2, var))
for (i in 2:15) wss[i] <- sum(kmeans(data, centers = i)$withinss)
plot(1:15, wss, type = "b", xlab = "Number of Clusters", ylab = "Within groups sum of squares")

plot of chunk unnamed-chunk-5

# K-Means Cluster Analysis
fit <- kmeans(data, 5)  # 5 cluster solution
# get cluster means
aggregate(data, by = list(fit$cluster), FUN = mean)
##   Group.1 Elevation   River    Port Temperature
## 1       1     670.6 80244.6 66560.4       58.30
## 2       2     638.0   423.1   420.9      656.10
## 3       3     984.8 31522.5 38505.2       64.30
## 4       4     897.1 10753.7 10570.8       60.84
## 5       5   32237.6   195.0   115.6    32399.06
# append cluster assignment
Nnorthlands <- data.frame(Nnorthlands, fit$cluster)
library(plyr)
# Break up data by cluster, then fit the specified model to each piece and return a list
models <- dlply(Nnorthlands, "fit.cluster", function(df) lm(SmoothPop ~ . - fit.cluster, data = df))

# Apply coef to each model and return a data frame
ldply(models, coef)
##   fit.cluster (Intercept) Elevation      River       Port Temperature
## 1           1       144.3  0.005713 -3.542e-04 -2.396e-04    -0.33200
## 2           2       406.0 -0.017290 -7.567e-02 -1.502e-01     0.78229
## 3           3       101.2  0.006448 -5.303e-05 -7.933e-05    -0.18951
## 4           4       112.8  0.004764  5.545e-04 -1.396e-03    -0.17098
## 5           5     14688.9 -0.072851 -5.592e+00 -3.966e+01    -0.04412

# Print the summary of each model
l_ply(models, summary, .print = TRUE)
## 
## Call:
## lm(formula = SmoothPop ~ . - fit.cluster, data = df)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -37.92  -7.31   1.88   7.41  49.71 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.44e+02   8.08e+00   17.84  < 2e-16 ***
## Elevation    5.71e-03   1.72e-03    3.33   0.0011 ** 
## River       -3.54e-04   5.28e-05   -6.71  4.6e-10 ***
## Port        -2.40e-04   4.01e-05   -5.97  1.9e-08 ***
## Temperature -3.32e-01   6.81e-02   -4.88  2.9e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 14.5 on 139 degrees of freedom
## Multiple R-squared:  0.414,  Adjusted R-squared:  0.397 
## F-statistic: 24.6 on 4 and 139 DF,  p-value: 2.17e-15
## 
## 
## Call:
## lm(formula = SmoothPop ~ . - fit.cluster, data = df)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -23159   -342   -271   -156 130754 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 406.03529   16.69161   24.33  < 2e-16 ***
## Elevation    -0.01729    0.00621   -2.78  0.00537 ** 
## River        -0.07567    0.02272   -3.33  0.00087 ***
## Port         -0.15016    0.02328   -6.45  1.1e-10 ***
## Temperature   0.78229    0.00621  125.96  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3860 on 96601 degrees of freedom
## Multiple R-squared:  0.146,  Adjusted R-squared:  0.146 
## F-statistic: 4.14e+03 on 4 and 96601 DF,  p-value: <2e-16
## 
## 
## Call:
## lm(formula = SmoothPop ~ . - fit.cluster, data = df)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -53.78  -9.68   1.46  10.34  49.27 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.01e+02   3.85e+00   26.26  < 2e-16 ***
## Elevation    6.45e-03   6.24e-04   10.33  < 2e-16 ***
## River       -5.30e-05   6.32e-05   -0.84     0.40    
## Port        -7.93e-05   5.26e-05   -1.51     0.13    
## Temperature -1.90e-01   2.48e-02   -7.64  2.1e-13 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 16.5 on 357 degrees of freedom
## Multiple R-squared:  0.343,  Adjusted R-squared:  0.336 
## F-statistic: 46.6 on 4 and 357 DF,  p-value: <2e-16
## 
## 
## Call:
## lm(formula = SmoothPop ~ . - fit.cluster, data = df)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -66.80 -20.64  -5.97   7.05 123.52 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.13e+02   3.63e+00   31.08  < 2e-16 ***
## Elevation    4.76e-03   5.15e-04    9.24  < 2e-16 ***
## River        5.55e-04   1.27e-04    4.38  1.3e-05 ***
## Port        -1.40e-03   1.42e-04   -9.85  < 2e-16 ***
## Temperature -1.71e-01   4.53e-02   -3.78  0.00016 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 32.5 on 1647 degrees of freedom
## Multiple R-squared:  0.114,  Adjusted R-squared:  0.112 
## F-statistic: 52.9 on 4 and 1647 DF,  p-value: <2e-16
## 
## 
## Call:
## lm(formula = SmoothPop ~ . - fit.cluster, data = df)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
##  -9316  -4719  -3037    222  60849 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.47e+04   1.52e+03    9.66  < 2e-16 ***
## Elevation   -7.29e-02   2.01e-02   -3.62  0.00033 ***
## River       -5.59e+00   4.79e+00   -1.17  0.24340    
## Port        -3.97e+01   1.02e+01   -3.88  0.00012 ***
## Temperature -4.41e-02   1.59e-02   -2.77  0.00585 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 9730 on 558 degrees of freedom
## Multiple R-squared:  0.178,  Adjusted R-squared:  0.172 
## F-statistic: 30.1 on 4 and 558 DF,  p-value: <2e-16
mydataplot = cbind(northlands[, 1:2], Nnorthlands)
plot(mydataplot$Column, mydataplot$Row, col = mydataplot$fit.cluster, pch = 20, cex = 1, ylim = rev(range(mydataplot$Row)))
legend("bottomleft", legend = paste("Group", 1:5), pch = 1, col = 1:5)

plot of chunk unnamed-chunk-7