options(width = 320)
northlands = read.csv("Canada_raw.csv")
head(northlands)
## Row Column SmoothPop Elevation River Port Temperature
## 1 546 475 1.514 566 607.6 1461 258.0
## 2 547 473 2.103 233 641.5 1471 257.8
## 3 547 475 2.275 166 666.0 1472 258.0
## 4 548 466 1.981 871 543.8 1458 273.1
## 5 548 468 2.265 342 548.2 1458 273.1
## 6 548 469 2.422 871 548.2 1453 258.9
EDA <- function(data) {
print(summary(data))
par(mfrow = c(5, 2))
for (i in 1:ncol(data)) {
var = names(data)[i]
plot(density(data[, i]), main = paste("Density of ", var))
plot(sort(data[, i]), main = paste("IndexPlot of ", var), pch = ".")
}
pairs(~., data, main = "Simple Scatterplot Matrix")
}
EDA(northlands[, 3:7])
## SmoothPop Elevation River Port Temperature
## Min. : 0 Min. : -42 Min. : 0.0 Min. : 0 Min. :249
## 1st Qu.: 5 1st Qu.: 193 1st Qu.: 6.7 1st Qu.: 245 1st Qu.:267
## Median : 17 Median : 336 Median : 17.1 Median : 481 Median :275
## Mean : 2806 Mean : 467 Mean : 49.1 Mean : 518 Mean :273
## 3rd Qu.: 342 3rd Qu.: 588 3rd Qu.: 38.2 3rd Qu.: 738 3rd Qu.:279
## Max. :721316 Max. :5409 Max. :719.7 Max. :1544 Max. :288
library(preprocessCore)
Nnorthlands = as.matrix(northlands[, 3:7])
Nnorthlands = normalize.quantiles(Nnorthlands)
Nnorthlands = as.data.frame(Nnorthlands)
names(Nnorthlands) = names(northlands[, 3:7])
EDA(Nnorthlands)
## SmoothPop Elevation River Port Temperature
## Min. : 50 Min. : 41 Min. : 59 Min. : 46 Min. : 50
## 1st Qu.: 139 1st Qu.: 144 1st Qu.: 143 1st Qu.: 144 1st Qu.: 144
## Median : 225 Median : 225 Median : 225 Median : 225 Median : 225
## Mean : 823 Mean : 823 Mean : 823 Mean : 823 Mean : 823
## 3rd Qu.: 397 3rd Qu.: 397 3rd Qu.: 397 3rd Qu.: 397 3rd Qu.: 397
## Max. :145855 Max. :145855 Max. :145855 Max. :145855 Max. :136482
data = Nnorthlands[, 2:5]
wss <- (nrow(data) - 1) * sum(apply(data, 2, var))
for (i in 2:15) wss[i] <- sum(kmeans(data, centers = i)$withinss)
plot(1:15, wss, type = "b", xlab = "Number of Clusters", ylab = "Within groups sum of squares")
# K-Means Cluster Analysis
fit <- kmeans(data, 5) # 5 cluster solution
# get cluster means
aggregate(data, by = list(fit$cluster), FUN = mean)
## Group.1 Elevation River Port Temperature
## 1 1 670.6 80244.6 66560.4 58.30
## 2 2 638.0 423.1 420.9 656.10
## 3 3 984.8 31522.5 38505.2 64.30
## 4 4 897.1 10753.7 10570.8 60.84
## 5 5 32237.6 195.0 115.6 32399.06
# append cluster assignment
Nnorthlands <- data.frame(Nnorthlands, fit$cluster)
library(plyr)
# Break up data by cluster, then fit the specified model to each piece and return a list
models <- dlply(Nnorthlands, "fit.cluster", function(df) lm(SmoothPop ~ . - fit.cluster, data = df))
# Apply coef to each model and return a data frame
ldply(models, coef)
## fit.cluster (Intercept) Elevation River Port Temperature
## 1 1 144.3 0.005713 -3.542e-04 -2.396e-04 -0.33200
## 2 2 406.0 -0.017290 -7.567e-02 -1.502e-01 0.78229
## 3 3 101.2 0.006448 -5.303e-05 -7.933e-05 -0.18951
## 4 4 112.8 0.004764 5.545e-04 -1.396e-03 -0.17098
## 5 5 14688.9 -0.072851 -5.592e+00 -3.966e+01 -0.04412
# Print the summary of each model
l_ply(models, summary, .print = TRUE)
##
## Call:
## lm(formula = SmoothPop ~ . - fit.cluster, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -37.92 -7.31 1.88 7.41 49.71
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.44e+02 8.08e+00 17.84 < 2e-16 ***
## Elevation 5.71e-03 1.72e-03 3.33 0.0011 **
## River -3.54e-04 5.28e-05 -6.71 4.6e-10 ***
## Port -2.40e-04 4.01e-05 -5.97 1.9e-08 ***
## Temperature -3.32e-01 6.81e-02 -4.88 2.9e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 14.5 on 139 degrees of freedom
## Multiple R-squared: 0.414, Adjusted R-squared: 0.397
## F-statistic: 24.6 on 4 and 139 DF, p-value: 2.17e-15
##
##
## Call:
## lm(formula = SmoothPop ~ . - fit.cluster, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -23159 -342 -271 -156 130754
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 406.03529 16.69161 24.33 < 2e-16 ***
## Elevation -0.01729 0.00621 -2.78 0.00537 **
## River -0.07567 0.02272 -3.33 0.00087 ***
## Port -0.15016 0.02328 -6.45 1.1e-10 ***
## Temperature 0.78229 0.00621 125.96 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3860 on 96601 degrees of freedom
## Multiple R-squared: 0.146, Adjusted R-squared: 0.146
## F-statistic: 4.14e+03 on 4 and 96601 DF, p-value: <2e-16
##
##
## Call:
## lm(formula = SmoothPop ~ . - fit.cluster, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -53.78 -9.68 1.46 10.34 49.27
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.01e+02 3.85e+00 26.26 < 2e-16 ***
## Elevation 6.45e-03 6.24e-04 10.33 < 2e-16 ***
## River -5.30e-05 6.32e-05 -0.84 0.40
## Port -7.93e-05 5.26e-05 -1.51 0.13
## Temperature -1.90e-01 2.48e-02 -7.64 2.1e-13 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 16.5 on 357 degrees of freedom
## Multiple R-squared: 0.343, Adjusted R-squared: 0.336
## F-statistic: 46.6 on 4 and 357 DF, p-value: <2e-16
##
##
## Call:
## lm(formula = SmoothPop ~ . - fit.cluster, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -66.80 -20.64 -5.97 7.05 123.52
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.13e+02 3.63e+00 31.08 < 2e-16 ***
## Elevation 4.76e-03 5.15e-04 9.24 < 2e-16 ***
## River 5.55e-04 1.27e-04 4.38 1.3e-05 ***
## Port -1.40e-03 1.42e-04 -9.85 < 2e-16 ***
## Temperature -1.71e-01 4.53e-02 -3.78 0.00016 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 32.5 on 1647 degrees of freedom
## Multiple R-squared: 0.114, Adjusted R-squared: 0.112
## F-statistic: 52.9 on 4 and 1647 DF, p-value: <2e-16
##
##
## Call:
## lm(formula = SmoothPop ~ . - fit.cluster, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9316 -4719 -3037 222 60849
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.47e+04 1.52e+03 9.66 < 2e-16 ***
## Elevation -7.29e-02 2.01e-02 -3.62 0.00033 ***
## River -5.59e+00 4.79e+00 -1.17 0.24340
## Port -3.97e+01 1.02e+01 -3.88 0.00012 ***
## Temperature -4.41e-02 1.59e-02 -2.77 0.00585 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 9730 on 558 degrees of freedom
## Multiple R-squared: 0.178, Adjusted R-squared: 0.172
## F-statistic: 30.1 on 4 and 558 DF, p-value: <2e-16
mydataplot = cbind(northlands[, 1:2], Nnorthlands)
plot(mydataplot$Column, mydataplot$Row, col = mydataplot$fit.cluster, pch = 20, cex = 1, ylim = rev(range(mydataplot$Row)))
legend("bottomleft", legend = paste("Group", 1:5), pch = 1, col = 1:5)