# Required libraries
library(MASS)
library(car)
library(lattice)
library(cluster)
library(fpc)
library(GPArotation)
library(FactoMineR)
#library(psych)
#library(nFactors)
Preview PA Climate Data
data<-read.csv("C:\\Users\\Jenn\\Documents\\R\\Multivariate\\paclimate.csv",header = TRUE)
climate <- data[6:13]
# Should we standardize our data?
variables<- var(climate)
# standardize data
scaled_climate <- scale(climate)
# Preview first 10 obs
knitr::kable(head(climate, n=10), digits = 3)
| 1233 |
464 |
1134 |
412 |
1499 |
634 |
923 |
224 |
| 1119 |
460 |
978 |
418 |
1342 |
631 |
779 |
223 |
| 1182 |
453 |
1122 |
410 |
1452 |
624 |
934 |
218 |
| 1115 |
462 |
1108 |
421 |
1332 |
630 |
902 |
229 |
| 1129 |
467 |
1033 |
419 |
1245 |
640 |
757 |
239 |
| 1342 |
469 |
1127 |
428 |
1406 |
650 |
1150 |
230 |
| 1098 |
469 |
986 |
432 |
1288 |
645 |
809 |
234 |
| 1404 |
470 |
1219 |
430 |
1377 |
647 |
907 |
240 |
| 1318 |
476 |
1185 |
434 |
1389 |
649 |
919 |
243 |
| 847 |
494 |
859 |
452 |
1133 |
677 |
544 |
262 |
Scatterplot Matrix
# Scatterplot Matrix
par(mfrow = c(1, 1))
scatterplotMatrix(climate)

Principal Component Analysis
# Principal Component Analysis
climate.pca <- prcomp(scaled_climate)
summary(climate.pca)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6
## Standard deviation 2.0728 1.6918 0.66277 0.53339 0.30038 0.11114
## Proportion of Variance 0.5371 0.3578 0.05491 0.03556 0.01128 0.00154
## Cumulative Proportion 0.5371 0.8948 0.94974 0.98530 0.99658 0.99812
## PC7 PC8
## Standard deviation 0.10430 0.06424
## Proportion of Variance 0.00136 0.00052
## Cumulative Proportion 0.99948 1.00000
# Screeplot
screeplot(climate.pca, type="lines")

# Principal Component Factor Estimation
pc<-prcomp(climate,center=TRUE, scale=TRUE) #correlation PCA
pc$sdev
## [1] 2.07282889 1.69175128 0.66277180 0.53338878 0.30038343 0.11114463
## [7] 0.10429955 0.06423563
sum((pc$sdev)^2 > mean((pc$sdev)^2)) # two factors should be used (Kaiser's rule)
## [1] 2
Factor Loadings & Rotations
# no rotation
pcfactorloadings <- pc$rotation[,1:2]
print(pc$rotation[,1:2],digits = 2, cutoff=.3)
## PC1 PC2
## SON.PRCP.NORMAL -0.104 0.490
## SON.TAVG.NORMAL 0.476 0.052
## MAM.PRCP.NORMAL 0.055 0.535
## MAM.TAVG.NORMAL 0.478 0.051
## JJA.PRCP.NORMAL -0.272 0.394
## JJA.TAVG.NORMAL 0.475 0.040
## DJF.PRCP.NORMAL 0.060 0.555
## DJF.TAVG.NORMAL 0.477 0.057
load <- pcfactorloadings[,1:2]
plot(load,type="n",main="PCF,no rotation") # set up plot
text(load,labels=names(climate),cex=.7) # add variable names

# varimax rotation
rot<-varimax(pcfactorloadings, normalize = FALSE)
print(rot$loadings,digits = 2, cutoff=.3)
##
## Loadings:
## PC1 PC2
## SON.PRCP.NORMAL 0.50
## SON.TAVG.NORMAL 0.48
## MAM.PRCP.NORMAL 0.53
## MAM.TAVG.NORMAL 0.48
## JJA.PRCP.NORMAL 0.41
## JJA.TAVG.NORMAL 0.48
## DJF.PRCP.NORMAL 0.55
## DJF.TAVG.NORMAL 0.48
##
## PC1 PC2
## SS loadings 1.00 1.00
## Proportion Var 0.12 0.12
## Cumulative Var 0.12 0.25
load <- rot$loadings[,1:2]
plot(load,type="n",main="PCF,varimax") # set up plot
text(load,labels=names(climate),cex=.7) # add variable names

Boxplots for Seasonal Precipitation & Temperature
data_elevation <- read.csv("C:\\Users\\Jenn\\Documents\\R\\Multivariate\\paclimateHIGHMIDLOW.csv", header = TRUE)
# MANOVA
# 3 elevations, n = 139.
attach(data_elevation)
# Boxplots
par(mfrow = c(2, 2))
boxplot(V1 ~ group, data_elevation, main = "March-May Seasonal Precipitation",col = c("pink", " purple", "light blue"))
boxplot(V2 ~ group, data_elevation, main = "June-Aug Seasonal Precipitation",col = c("pink", " purple", "light blue"))
boxplot(V3 ~ group, data_elevation, main = "Sept-Nov Seasonal Precipitation",col = c("pink", " purple", "light blue"))
boxplot(V4 ~ group, data_elevation, main = "Dec-Feb Seasonal Precipitation",col = c("pink", " purple", "light blue"))

par(mfrow = c(2, 2))
boxplot(V5 ~ group, data_elevation, main = "Autumn Average Temperature",col = c("pink", " purple", "light blue"))
boxplot(V6 ~ group, data_elevation, main = "Winter Average Temperature",col = c("pink", " purple", "light blue"))
boxplot(V7 ~ group, data_elevation, main = "Spring Average Temperature",col = c("pink", " purple", "light blue"))
boxplot(V8 ~ group, data_elevation, main = "Summer Average Temperature",col = c("pink", " purple", "light blue"))

MANOVA & Univariate ANOVA Tables
# MANOVA
manova <- manova(cbind(V1,V2,V3,V4,V5,V6,V7,V8) ~ group, data_elevation)
manova
## Call:
## manova(cbind(V1, V2, V3, V4, V5, V6, V7, V8) ~ group, data_elevation)
##
## Terms:
## group Residuals
## resp 1 46682.6 1960267.8
## resp 2 4716.2 166990.5
## resp 3 95662.7 1439355.8
## resp 4 7131.2 200700.4
## resp 5 55015.8 2125826.3
## resp 6 5308.2 205795.8
## resp 7 93387.4 1861967.1
## resp 8 6566.0 214164.5
## Deg. of Freedom 2 135
##
## Residual standard errors: 120.501 35.17053 103.2565 38.55736 125.4865 39.04374 117.4408 39.82968
## Estimated effects may be unbalanced
# Univariate ANOVA Tables
summary(manova, test = "Wilks")
## Df Wilks approx F num Df den Df Pr(>F)
## group 2 0.83453 1.5145 16 256 0.09448 .
## Residuals 135
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(manova, test = "Roy")
## Df Roy approx F num Df den Df Pr(>F)
## group 2 0.14513 2.3402 8 129 0.02214 *
## Residuals 135
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(manova, test = "Pillai")
## Df Pillai approx F num Df den Df Pr(>F)
## group 2 0.17109 1.5084 16 258 0.09651 .
## Residuals 135
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(manova, test = "Hotelling-Lawley")
## Df Hotelling-Lawley approx F num Df den Df Pr(>F)
## group 2 0.19154 1.5203 16 254 0.09257 .
## Residuals 135
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary.aov(manova) # univariate ANOVA tables
## Response V1 :
## Df Sum Sq Mean Sq F value Pr(>F)
## group 2 46683 23341 1.6075 0.2042
## Residuals 135 1960268 14520
##
## Response V2 :
## Df Sum Sq Mean Sq F value Pr(>F)
## group 2 4716 2358.1 1.9064 0.1526
## Residuals 135 166990 1237.0
##
## Response V3 :
## Df Sum Sq Mean Sq F value Pr(>F)
## group 2 95663 47831 4.4862 0.01299 *
## Residuals 135 1439356 10662
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Response V4 :
## Df Sum Sq Mean Sq F value Pr(>F)
## group 2 7131 3565.6 2.3984 0.09473 .
## Residuals 135 200700 1486.7
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Response V5 :
## Df Sum Sq Mean Sq F value Pr(>F)
## group 2 55016 27508 1.7469 0.1782
## Residuals 135 2125826 15747
##
## Response V6 :
## Df Sum Sq Mean Sq F value Pr(>F)
## group 2 5308 2654.1 1.741 0.1792
## Residuals 135 205796 1524.4
##
## Response V7 :
## Df Sum Sq Mean Sq F value Pr(>F)
## group 2 93387 46694 3.3855 0.03676 *
## Residuals 135 1861967 13792
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Response V8 :
## Df Sum Sq Mean Sq F value Pr(>F)
## group 2 6566 3283.0 2.0695 0.1302
## Residuals 135 214164 1586.4
Cluster Plot by Low, Mid, & High Elevation
# Clustering
elevationHIGH <- subset(data_elevation,select = c(V1:V8), group == "HIGH")
elevationMID <- subset(data_elevation,select = c(V1:V8), group == "MID")
elevationLOW <- subset(data_elevation,select = c(V1:V8), group == "LOW")
cluster <- c(data_elevation[4], nrow=1, byrow=TRUE)
par(mfrow = c(1, 1))
clusplot(climate, cluster$group, color=TRUE, shade=TRUE, labels=2, lines=0)

# Does not appear any distinct clusters. BAD
Cluster Analysis
# Gap Statistics
gap <- clusGap(data_elevation[7:14], FUN = kmeans, K.max = 8)
plot(gap) # there appears to be 3 clusters

# Euclidean Distance
scaled_climate <- scale(climate)
# dissimilarity: 2-norm
d_Euclidean <- dist(scaled_climate, method="euclidean")
# K-Means Cluster Analysis (based on Euclidean distance)
KmeansE <- kmeans(d_Euclidean, 3) # k=3 clusters
KmeansE$cluster
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18
## 3 3 3 3 3 3 3 3 3 2 3 2 3 2 2 2 2 2
## 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36
## 2 3 2 2 2 2 2 2 1 1 1 1 2 2 2 2 2 1
## 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54
## 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72
## 2 1 1 1 2 1 1 1 2 1 1 1 2 1 1 1 2 1
## 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90
## 1 2 1 3 2 1 2 1 2 2 1 1 2 1 2 2 2 2
## 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108
## 2 1 1 2 2 2 1 3 2 2 2 2 3 3 3 3 2 2
## 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126
## 2 2 1 2 3 2 2 3 2 2 3 2 2 3 3 2 2 3
## 127 128 129 130 131 132 133 134 135 136 137 138
## 3 3 3 3 2 3 3 3 3 3 3 3
# dissimilarity: 1-norm
d_Manhattan <- dist(scaled_climate, method="manhattan")
# K-Means Cluster Analysis (based on Manhattan distance)
KmeansM <- kmeans(d_Manhattan, 3) # k=3 clusters
KmeansM$cluster
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18
## 1 1 1 1 1 1 1 1 1 1 1 3 1 1 3 3 3 3
## 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36
## 3 1 3 3 3 3 3 3 2 2 2 2 3 3 3 3 3 2
## 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54
## 2 3 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72
## 3 2 2 2 3 2 2 2 3 2 2 2 3 2 2 2 3 2
## 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90
## 2 3 2 1 3 2 3 2 3 3 2 2 3 2 3 3 3 3
## 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108
## 3 2 2 3 3 3 2 1 3 3 1 3 3 1 1 1 3 3
## 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126
## 3 3 2 3 1 3 3 1 3 3 1 3 3 1 1 3 3 1
## 127 128 129 130 131 132 133 134 135 136 137 138
## 1 1 1 1 3 1 1 1 1 1 1 1
# cluster plot
par(mfrow = c(1, 1))
clusplot(scaled_climate, KmeansE$cluster, color=TRUE, shade=TRUE, labels=2, lines=0)

clusplot(scaled_climate, KmeansM$cluster, color=TRUE, shade=TRUE, labels=2, lines=0)

Comparing Temperatures & Precipation
# comparison - TEMPERATURES
compare_temp <- data.frame(data_elevation[8], data_elevation[14], data_elevation[10], data_elevation[12],KmeansE$cluster)
# Autumn temp avg, Winter temp avg, Spring temp avg, Summer temp avg
head(compare_temp, n=10)
## V2 V8 V4 V6 KmeansE.cluster
## 1 464 224 412 634 3
## 2 460 223 418 631 3
## 3 453 218 410 624 3
## 4 462 229 421 630 3
## 5 467 239 419 640 3
## 6 469 230 428 650 3
## 7 469 234 432 645 3
## 8 470 240 430 647 3
## 9 476 243 434 649 3
## 10 494 262 452 677 2
# 3 = low average temps (RED), #2 mid average temps (BLUE), #1 = high average temps (PURPLE)
# comparison - Precipitation
compare_precip <- data.frame(data_elevation[7], data_elevation[13], data_elevation[9], data_elevation[11],KmeansE$cluster)
# Autumn precip avg, Winter precip avg, Spring precip avg, Summer precip avg
head(compare_precip, n=10) # print first 10 lines
## V1 V7 V3 V5 KmeansE.cluster
## 1 1233 923 1134 1499 3
## 2 1119 779 978 1342 3
## 3 1182 934 1122 1452 3
## 4 1115 902 1108 1332 3
## 5 1129 757 1033 1245 3
## 6 1342 1150 1127 1406 3
## 7 1098 809 986 1288 3
## 8 1404 907 1219 1377 3
## 9 1318 919 1185 1389 3
## 10 847 544 859 1133 2
# Preciptitation does not give much information to analysis
# 3 = high precip (RED), #2 mid precip (BLUE), #1 = low precip (PURPLE)
# It appears this is a more appropriate clustering scheme
K-Means Clustering (Euclidean distance)
K <- kmeans(d_Euclidean, 3) # k=3 clusters
K$clust
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18
## 3 3 3 3 3 3 3 3 3 2 3 2 3 2 2 2 2 2
## 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36
## 2 3 2 2 2 2 2 2 1 1 1 1 2 2 2 2 2 1
## 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54
## 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72
## 2 1 1 1 2 1 1 1 2 1 1 1 2 1 1 1 2 1
## 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90
## 1 2 1 3 2 1 2 1 2 2 1 1 2 1 2 2 2 2
## 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108
## 2 1 1 2 2 2 1 3 2 2 2 2 3 3 3 3 2 2
## 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126
## 2 2 1 2 3 2 2 3 2 2 3 2 2 3 3 2 2 3
## 127 128 129 130 131 132 133 134 135 136 137 138
## 3 3 3 3 2 3 3 3 3 3 3 3
compare <- data.frame(data[7], data[13], data[9], data[11],K$clust,data[6],data[12], data[8], data[10])
# Autumn temp avg, Winter temp avg, Spring temp avg, Summer temp avg, CLUSTERING Autumn precip avg, Winter precip avg, Spring precip avg, Summer precip avg
# print first 10 lines
knitr::kable(head(head(compare, n=10), n=10), digits = 3)
| 464 |
224 |
412 |
634 |
3 |
1233 |
923 |
1134 |
1499 |
| 460 |
223 |
418 |
631 |
3 |
1119 |
779 |
978 |
1342 |
| 453 |
218 |
410 |
624 |
3 |
1182 |
934 |
1122 |
1452 |
| 462 |
229 |
421 |
630 |
3 |
1115 |
902 |
1108 |
1332 |
| 467 |
239 |
419 |
640 |
3 |
1129 |
757 |
1033 |
1245 |
| 469 |
230 |
428 |
650 |
3 |
1342 |
1150 |
1127 |
1406 |
| 469 |
234 |
432 |
645 |
3 |
1098 |
809 |
986 |
1288 |
| 470 |
240 |
430 |
647 |
3 |
1404 |
907 |
1219 |
1377 |
| 476 |
243 |
434 |
649 |
3 |
1318 |
919 |
1185 |
1389 |
| 494 |
262 |
452 |
677 |
2 |
847 |
544 |
859 |
1133 |
1 (PURPLE) = high average temps & low precipitation
2 (BLUE) = mid average temps & mid precipitation
3 (RED) = low average temps & high precipitation
par(mfrow = c(2, 2))
boxplot(V1 ~ K$clust, data_elevation, main = "March-May Seasonal Precipitation", col = c("pink", "light green", "light blue"))
boxplot(V2 ~ K$clust, data_elevation, main = "June-Aug Seasonal Precipitation",col = c("pink", "light green", "light blue"))
boxplot(V3 ~ K$clust, data_elevation, main = "Sept-Nov Seasonal Precipitation",col = c("pink", "light green", "light blue"))
boxplot(V4 ~ K$clust, data_elevation, main = "Dec-Feb Seasonal Precipitation", col = c("pink", "light green", "light blue"))

par(mfrow = c(2, 2))
boxplot(V5 ~ K$clust, data_elevation, main = "Autumn Average Temperature", col = c("pink", "light green", "light blue"))
boxplot(V6 ~ K$clust, data_elevation, main = "Winter Average Temperature", col = c("pink", "light green", "light blue"))
boxplot(V7 ~ K$clust, data_elevation, main = "Spring Average Temperature", col = c("pink", "light green", "light blue"))
boxplot(V8 ~ K$clust, data_elevation, main = "Summer Average Temperature", col = c("pink", "light green", "light blue"))

Pittsburgh = 95 PITTSBURGH INTERNATIONAL AIRPORT PA US (PURPLE = Mid temp)
State College = 119 STATE COLLEGE PA US (RED = LOW TEMP)
Philadelpha = 89 PHILADELPHIA INTERNATIONAL AIRPORT PA US (PURPLE = Mid temp)
MANOVA of Regions
manova_region <- manova(cbind(V1,V2,V3,V4,V5,V6,V7,V8) ~ K$clust, data_elevation)
manova_region
## Call:
## manova(cbind(V1, V2, V3, V4, V5, V6, V7, V8) ~ K$clust, data_elevation)
##
## Terms:
## K$clust Residuals
## resp 1 30568 1976383
## resp 2 135230.1 36476.5
## resp 3 13696.7 1521321.8
## resp 4 164690.5 43141.1
## resp 5 474686.7 1706155.5
## resp 6 168846.0 42257.9
## resp 7 17844.9 1937509.5
## resp 8 175925.0 44805.5
## Deg. of Freedom 1 136
##
## Residual standard errors: 120.5497 16.37711 105.7648 17.81051 112.0056 17.62725 119.3583 18.15082
## Estimated effects may be unbalanced
summary(manova_region, test = "Wilks")
## Df Wilks approx F num Df den Df Pr(>F)
## K$clust 1 0.1815 72.716 8 129 < 2.2e-16 ***
## Residuals 136
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1