options(width = 320)
northlands = read.csv("Canada_raw.csv")
head(northlands)
## Row Column SmoothPop Elevation River Port Temperature
## 1 546 475 1.514 566 607.6 1461 258.0
## 2 547 473 2.103 233 641.5 1471 257.8
## 3 547 475 2.275 166 666.0 1472 258.0
## 4 548 466 1.981 871 543.8 1458 273.1
## 5 548 468 2.265 342 548.2 1458 273.1
## 6 548 469 2.422 871 548.2 1453 258.9
EDA <- function(data) {
print(summary(data))
par(mfrow = c(5, 2))
for (i in 1:ncol(data)) {
var = names(data)[i]
plot(density(data[, i]), main = paste("Density of ", var))
plot(sort(data[, i]), main = paste("IndexPlot of ", var), pch = ".")
}
pairs(~., data, main = "Simple Scatterplot Matrix")
}
EDA(northlands[, 3:7])
## SmoothPop Elevation River Port Temperature
## Min. : 0 Min. : -42 Min. : 0.0 Min. : 0 Min. :249
## 1st Qu.: 5 1st Qu.: 193 1st Qu.: 6.7 1st Qu.: 245 1st Qu.:267
## Median : 17 Median : 336 Median : 17.1 Median : 481 Median :275
## Mean : 2806 Mean : 467 Mean : 49.1 Mean : 518 Mean :273
## 3rd Qu.: 342 3rd Qu.: 588 3rd Qu.: 38.2 3rd Qu.: 738 3rd Qu.:279
## Max. :721316 Max. :5409 Max. :719.7 Max. :1544 Max. :288
Znorthlands = as.data.frame(scale(northlands[, 3:7]))
EDA(Znorthlands)
## SmoothPop Elevation River Port Temperature
## Min. :-0.14 Min. :-1.218 Min. :-0.525 Min. :-1.558 Min. :-3.188
## 1st Qu.:-0.14 1st Qu.:-0.656 1st Qu.:-0.454 1st Qu.:-0.821 1st Qu.:-0.748
## Median :-0.13 Median :-0.313 Median :-0.343 Median :-0.112 Median : 0.207
## Mean : 0.00 Mean : 0.000 Mean : 0.000 Mean : 0.000 Mean : 0.000
## 3rd Qu.:-0.12 3rd Qu.: 0.290 3rd Qu.:-0.117 3rd Qu.: 0.659 3rd Qu.: 0.820
## Max. :34.72 Max. :11.829 Max. : 7.174 Max. : 3.080 Max. : 1.979
data = Znorthlands[, 2:5]
wss <- (nrow(data) - 1) * sum(apply(data, 2, var))
for (i in 2:15) wss[i] <- sum(kmeans(data, centers = i)$withinss)
## Warning: Quick-TRANSfer stage steps exceeded maximum (= 4966350)
plot(1:15, wss, type = "b", xlab = "Number of Clusters", ylab = "Within groups sum of squares")
# K-Means Cluster Analysis
fit <- kmeans(data, 3) # 3 cluster solution
# get cluster means
aggregate(data, by = list(fit$cluster), FUN = mean)
## Group.1 Elevation River Port Temperature
## 1 1 -0.2356 2.9955 1.8947 -1.8533
## 2 2 0.3032 -0.3238 -0.6516 0.6806
## 3 3 -0.4657 -0.1527 0.6718 -0.7315
# append cluster assignment
Znorthlands <- data.frame(Znorthlands, fit$cluster)
library(plyr)
# Break up data by cluster, then fit the specified model to each piece and return a list
models <- dlply(Znorthlands, "fit.cluster", function(df) lm(SmoothPop ~ . - fit.cluster, data = df))
# Apply coef to each model and return a data frame
ldply(models, coef)
## fit.cluster (Intercept) Elevation River Port Temperature
## 1 1 -0.1353 2.345e-05 0.0000124 -7.609e-05 -2.579e-05
## 2 2 -0.4869 -4.575e-02 -0.2103963 -2.253e-01 5.619e-01
## 3 3 -0.1331 -8.299e-04 -0.0001357 1.166e-03 3.026e-03
# Print the summary of each model
l_ply(models, summary, .print = TRUE)
##
## Call:
## lm(formula = SmoothPop ~ . - fit.cluster, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.31e-04 -4.13e-05 -7.10e-06 1.99e-05 3.07e-03
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.35e-01 7.75e-06 -17470.20 < 2e-16 ***
## Elevation 2.35e-05 2.15e-06 10.90 < 2e-16 ***
## River 1.24e-05 1.74e-06 7.13 1.1e-12 ***
## Port -7.61e-05 4.36e-06 -17.46 < 2e-16 ***
## Temperature -2.58e-05 5.19e-06 -4.97 7.0e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.000145 on 7953 degrees of freedom
## Multiple R-squared: 0.0523, Adjusted R-squared: 0.0519
## F-statistic: 110 on 4 and 7953 DF, p-value: <2e-16
##
##
## Call:
## lm(formula = SmoothPop ~ . - fit.cluster, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.22 -0.35 -0.14 0.07 33.83
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.48689 0.01317 -36.97 <2e-16 ***
## Elevation -0.04575 0.00475 -9.64 <2e-16 ***
## River -0.21040 0.02552 -8.24 <2e-16 ***
## Port -0.22532 0.00981 -22.98 <2e-16 ***
## Temperature 0.56190 0.01114 50.45 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.26 on 57768 degrees of freedom
## Multiple R-squared: 0.0718, Adjusted R-squared: 0.0718
## F-statistic: 1.12e+03 on 4 and 57768 DF, p-value: <2e-16
##
##
## Call:
## lm(formula = SmoothPop ~ . - fit.cluster, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.00580 -0.00150 -0.00067 0.00024 0.14161
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.33e-01 6.70e-05 -1986.92 <2e-16 ***
## Elevation -8.30e-04 6.49e-05 -12.79 <2e-16 ***
## River -1.36e-04 7.82e-05 -1.74 0.083 .
## Port 1.17e-03 7.09e-05 16.44 <2e-16 ***
## Temperature 3.03e-03 7.23e-05 41.84 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.00549 on 33591 degrees of freedom
## Multiple R-squared: 0.0665, Adjusted R-squared: 0.0664
## F-statistic: 598 on 4 and 33591 DF, p-value: <2e-16
mydataplot = cbind(northlands[, 1:2], Znorthlands)
plot(mydataplot$Column, mydataplot$Row, col = mydataplot$fit.cluster, pch = 20, cex = 1, ylim = rev(range(mydataplot$Row)))
legend("bottomleft", legend = paste("Group", 1:3), pch = 1, col = 1:5)