# Required libraries
library(MASS)
library(car)
library(lattice)
library(cluster)
library(fpc)
library(GPArotation)
library(FactoMineR)
#library(psych)
#library(nFactors)

Preview PA Climate Data

data<-read.csv("C:\\Users\\Jenn\\Documents\\R\\Multivariate\\paclimate.csv",header = TRUE)
climate <- data[6:13]

# Should we standardize our data?
variables<- var(climate)
# standardize data
scaled_climate <- scale(climate)

# Preview first 10 obs
knitr::kable(head(climate, n=10), digits = 3)
SON.PRCP.NORMAL SON.TAVG.NORMAL MAM.PRCP.NORMAL MAM.TAVG.NORMAL JJA.PRCP.NORMAL JJA.TAVG.NORMAL DJF.PRCP.NORMAL DJF.TAVG.NORMAL
1233 464 1134 412 1499 634 923 224
1119 460 978 418 1342 631 779 223
1182 453 1122 410 1452 624 934 218
1115 462 1108 421 1332 630 902 229
1129 467 1033 419 1245 640 757 239
1342 469 1127 428 1406 650 1150 230
1098 469 986 432 1288 645 809 234
1404 470 1219 430 1377 647 907 240
1318 476 1185 434 1389 649 919 243
847 494 859 452 1133 677 544 262


Scatterplot Matrix

# Scatterplot Matrix
par(mfrow = c(1, 1))
scatterplotMatrix(climate)


Principal Component Analysis

# Principal Component Analysis
climate.pca <- prcomp(scaled_climate)
summary(climate.pca)
## Importance of components:
##                           PC1    PC2     PC3     PC4     PC5     PC6
## Standard deviation     2.0728 1.6918 0.66277 0.53339 0.30038 0.11114
## Proportion of Variance 0.5371 0.3578 0.05491 0.03556 0.01128 0.00154
## Cumulative Proportion  0.5371 0.8948 0.94974 0.98530 0.99658 0.99812
##                            PC7     PC8
## Standard deviation     0.10430 0.06424
## Proportion of Variance 0.00136 0.00052
## Cumulative Proportion  0.99948 1.00000
# Screeplot
screeplot(climate.pca, type="lines")

# Principal Component Factor Estimation
pc<-prcomp(climate,center=TRUE, scale=TRUE) #correlation PCA
pc$sdev
## [1] 2.07282889 1.69175128 0.66277180 0.53338878 0.30038343 0.11114463
## [7] 0.10429955 0.06423563
sum((pc$sdev)^2 > mean((pc$sdev)^2)) # two factors should be used (Kaiser's rule)
## [1] 2

Factor Loadings & Rotations

# no rotation
pcfactorloadings <- pc$rotation[,1:2]
print(pc$rotation[,1:2],digits = 2, cutoff=.3)
##                    PC1   PC2
## SON.PRCP.NORMAL -0.104 0.490
## SON.TAVG.NORMAL  0.476 0.052
## MAM.PRCP.NORMAL  0.055 0.535
## MAM.TAVG.NORMAL  0.478 0.051
## JJA.PRCP.NORMAL -0.272 0.394
## JJA.TAVG.NORMAL  0.475 0.040
## DJF.PRCP.NORMAL  0.060 0.555
## DJF.TAVG.NORMAL  0.477 0.057
load <- pcfactorloadings[,1:2]
plot(load,type="n",main="PCF,no rotation") # set up plot 
text(load,labels=names(climate),cex=.7) # add variable names

# varimax rotation
rot<-varimax(pcfactorloadings, normalize = FALSE)
print(rot$loadings,digits = 2, cutoff=.3)
## 
## Loadings:
##                 PC1   PC2  
## SON.PRCP.NORMAL        0.50
## SON.TAVG.NORMAL  0.48      
## MAM.PRCP.NORMAL        0.53
## MAM.TAVG.NORMAL  0.48      
## JJA.PRCP.NORMAL        0.41
## JJA.TAVG.NORMAL  0.48      
## DJF.PRCP.NORMAL        0.55
## DJF.TAVG.NORMAL  0.48      
## 
##                 PC1  PC2
## SS loadings    1.00 1.00
## Proportion Var 0.12 0.12
## Cumulative Var 0.12 0.25
load <- rot$loadings[,1:2] 
plot(load,type="n",main="PCF,varimax") # set up plot 
text(load,labels=names(climate),cex=.7) # add variable names


Boxplots for Seasonal Precipitation & Temperature

data_elevation <- read.csv("C:\\Users\\Jenn\\Documents\\R\\Multivariate\\paclimateHIGHMIDLOW.csv", header = TRUE)

# MANOVA
# 3 elevations, n = 139.
attach(data_elevation) 


# Boxplots
par(mfrow = c(2, 2))
boxplot(V1 ~ group, data_elevation, main = "March-May Seasonal Precipitation",col = c("pink", " purple", "light blue"))
boxplot(V2 ~ group, data_elevation, main = "June-Aug Seasonal Precipitation",col = c("pink", " purple", "light blue"))
boxplot(V3 ~ group, data_elevation, main = "Sept-Nov Seasonal Precipitation",col = c("pink", " purple", "light blue"))
boxplot(V4 ~ group, data_elevation, main = "Dec-Feb Seasonal Precipitation",col = c("pink", " purple", "light blue"))

par(mfrow = c(2, 2))
boxplot(V5 ~ group, data_elevation, main = "Autumn Average Temperature",col = c("pink", " purple", "light blue"))
boxplot(V6 ~ group, data_elevation, main = "Winter Average Temperature",col = c("pink", " purple", "light blue"))
boxplot(V7 ~ group, data_elevation, main = "Spring Average Temperature",col = c("pink", " purple", "light blue"))
boxplot(V8 ~ group, data_elevation, main = "Summer Average Temperature",col = c("pink", " purple", "light blue"))

MANOVA & Univariate ANOVA Tables

# MANOVA
manova <- manova(cbind(V1,V2,V3,V4,V5,V6,V7,V8) ~ group, data_elevation)
manova
## Call:
##    manova(cbind(V1, V2, V3, V4, V5, V6, V7, V8) ~ group, data_elevation)
## 
## Terms:
##                     group Residuals
## resp 1            46682.6 1960267.8
## resp 2             4716.2  166990.5
## resp 3            95662.7 1439355.8
## resp 4             7131.2  200700.4
## resp 5            55015.8 2125826.3
## resp 6             5308.2  205795.8
## resp 7            93387.4 1861967.1
## resp 8             6566.0  214164.5
## Deg. of Freedom         2       135
## 
## Residual standard errors: 120.501 35.17053 103.2565 38.55736 125.4865 39.04374 117.4408 39.82968
## Estimated effects may be unbalanced
# Univariate ANOVA Tables
summary(manova, test = "Wilks")
##            Df   Wilks approx F num Df den Df  Pr(>F)  
## group       2 0.83453   1.5145     16    256 0.09448 .
## Residuals 135                                         
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(manova, test = "Roy")
##            Df     Roy approx F num Df den Df  Pr(>F)  
## group       2 0.14513   2.3402      8    129 0.02214 *
## Residuals 135                                         
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(manova, test = "Pillai")
##            Df  Pillai approx F num Df den Df  Pr(>F)  
## group       2 0.17109   1.5084     16    258 0.09651 .
## Residuals 135                                         
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(manova, test = "Hotelling-Lawley")
##            Df Hotelling-Lawley approx F num Df den Df  Pr(>F)  
## group       2          0.19154   1.5203     16    254 0.09257 .
## Residuals 135                                                  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary.aov(manova)        # univariate ANOVA tables
##  Response V1 :
##              Df  Sum Sq Mean Sq F value Pr(>F)
## group         2   46683   23341  1.6075 0.2042
## Residuals   135 1960268   14520               
## 
##  Response V2 :
##              Df Sum Sq Mean Sq F value Pr(>F)
## group         2   4716  2358.1  1.9064 0.1526
## Residuals   135 166990  1237.0               
## 
##  Response V3 :
##              Df  Sum Sq Mean Sq F value  Pr(>F)  
## group         2   95663   47831  4.4862 0.01299 *
## Residuals   135 1439356   10662                  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##  Response V4 :
##              Df Sum Sq Mean Sq F value  Pr(>F)  
## group         2   7131  3565.6  2.3984 0.09473 .
## Residuals   135 200700  1486.7                  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##  Response V5 :
##              Df  Sum Sq Mean Sq F value Pr(>F)
## group         2   55016   27508  1.7469 0.1782
## Residuals   135 2125826   15747               
## 
##  Response V6 :
##              Df Sum Sq Mean Sq F value Pr(>F)
## group         2   5308  2654.1   1.741 0.1792
## Residuals   135 205796  1524.4               
## 
##  Response V7 :
##              Df  Sum Sq Mean Sq F value  Pr(>F)  
## group         2   93387   46694  3.3855 0.03676 *
## Residuals   135 1861967   13792                  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##  Response V8 :
##              Df Sum Sq Mean Sq F value Pr(>F)
## group         2   6566  3283.0  2.0695 0.1302
## Residuals   135 214164  1586.4


Cluster Plot by Low, Mid, & High Elevation

# Clustering 
elevationHIGH <- subset(data_elevation,select = c(V1:V8), group == "HIGH")
elevationMID <- subset(data_elevation,select = c(V1:V8), group == "MID")
elevationLOW <- subset(data_elevation,select = c(V1:V8), group == "LOW")

cluster <- c(data_elevation[4], nrow=1, byrow=TRUE)
par(mfrow = c(1, 1))
clusplot(climate, cluster$group, color=TRUE, shade=TRUE, labels=2, lines=0)

# Does not appear any distinct clusters. BAD 


Cluster Analysis

# Gap Statistics     
gap <- clusGap(data_elevation[7:14], FUN = kmeans, K.max = 8)
plot(gap)    # there appears to be 3 clusters    

# Euclidean Distance 
scaled_climate <- scale(climate)

# dissimilarity:  2-norm 
d_Euclidean <- dist(scaled_climate, method="euclidean")
# K-Means Cluster Analysis  (based on Euclidean distance)
KmeansE <- kmeans(d_Euclidean, 3) # k=3 clusters 
KmeansE$cluster
##   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18 
##   3   3   3   3   3   3   3   3   3   2   3   2   3   2   2   2   2   2 
##  19  20  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35  36 
##   2   3   2   2   2   2   2   2   1   1   1   1   2   2   2   2   2   1 
##  37  38  39  40  41  42  43  44  45  46  47  48  49  50  51  52  53  54 
##   1   2   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1 
##  55  56  57  58  59  60  61  62  63  64  65  66  67  68  69  70  71  72 
##   2   1   1   1   2   1   1   1   2   1   1   1   2   1   1   1   2   1 
##  73  74  75  76  77  78  79  80  81  82  83  84  85  86  87  88  89  90 
##   1   2   1   3   2   1   2   1   2   2   1   1   2   1   2   2   2   2 
##  91  92  93  94  95  96  97  98  99 100 101 102 103 104 105 106 107 108 
##   2   1   1   2   2   2   1   3   2   2   2   2   3   3   3   3   2   2 
## 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 
##   2   2   1   2   3   2   2   3   2   2   3   2   2   3   3   2   2   3 
## 127 128 129 130 131 132 133 134 135 136 137 138 
##   3   3   3   3   2   3   3   3   3   3   3   3
# dissimilarity:  1-norm 
d_Manhattan <- dist(scaled_climate, method="manhattan")
# K-Means Cluster Analysis  (based on Manhattan distance)
KmeansM <- kmeans(d_Manhattan, 3) # k=3 clusters 
KmeansM$cluster
##   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18 
##   1   1   1   1   1   1   1   1   1   1   1   3   1   1   3   3   3   3 
##  19  20  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35  36 
##   3   1   3   3   3   3   3   3   2   2   2   2   3   3   3   3   3   2 
##  37  38  39  40  41  42  43  44  45  46  47  48  49  50  51  52  53  54 
##   2   3   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2 
##  55  56  57  58  59  60  61  62  63  64  65  66  67  68  69  70  71  72 
##   3   2   2   2   3   2   2   2   3   2   2   2   3   2   2   2   3   2 
##  73  74  75  76  77  78  79  80  81  82  83  84  85  86  87  88  89  90 
##   2   3   2   1   3   2   3   2   3   3   2   2   3   2   3   3   3   3 
##  91  92  93  94  95  96  97  98  99 100 101 102 103 104 105 106 107 108 
##   3   2   2   3   3   3   2   1   3   3   1   3   3   1   1   1   3   3 
## 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 
##   3   3   2   3   1   3   3   1   3   3   1   3   3   1   1   3   3   1 
## 127 128 129 130 131 132 133 134 135 136 137 138 
##   1   1   1   1   3   1   1   1   1   1   1   1
# cluster plot
par(mfrow = c(1, 1))
clusplot(scaled_climate, KmeansE$cluster, color=TRUE, shade=TRUE, labels=2, lines=0)

clusplot(scaled_climate, KmeansM$cluster, color=TRUE, shade=TRUE, labels=2, lines=0)


Comparing Temperatures & Precipation

# comparison - TEMPERATURES
compare_temp <- data.frame(data_elevation[8], data_elevation[14], data_elevation[10], data_elevation[12],KmeansE$cluster)
# Autumn temp avg, Winter temp avg, Spring temp avg, Summer temp avg
head(compare_temp, n=10)
##     V2  V8  V4  V6 KmeansE.cluster
## 1  464 224 412 634               3
## 2  460 223 418 631               3
## 3  453 218 410 624               3
## 4  462 229 421 630               3
## 5  467 239 419 640               3
## 6  469 230 428 650               3
## 7  469 234 432 645               3
## 8  470 240 430 647               3
## 9  476 243 434 649               3
## 10 494 262 452 677               2
# 3 = low average temps (RED),  #2 mid average temps (BLUE),  #1 = high average temps  (PURPLE)


# comparison - Precipitation
compare_precip <- data.frame(data_elevation[7], data_elevation[13], data_elevation[9], data_elevation[11],KmeansE$cluster)
# Autumn precip avg, Winter precip avg, Spring precip avg, Summer precip avg
head(compare_precip, n=10)  # print first 10 lines
##      V1   V7   V3   V5 KmeansE.cluster
## 1  1233  923 1134 1499               3
## 2  1119  779  978 1342               3
## 3  1182  934 1122 1452               3
## 4  1115  902 1108 1332               3
## 5  1129  757 1033 1245               3
## 6  1342 1150 1127 1406               3
## 7  1098  809  986 1288               3
## 8  1404  907 1219 1377               3
## 9  1318  919 1185 1389               3
## 10  847  544  859 1133               2
# Preciptitation does not give much information to analysis
# 3 = high precip (RED),  #2 mid precip (BLUE),  #1 = low precip  (PURPLE)

# It appears this is a more appropriate clustering scheme


K-Means Clustering (Euclidean distance)

K <- kmeans(d_Euclidean, 3) # k=3 clusters 
K$clust
##   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18 
##   3   3   3   3   3   3   3   3   3   2   3   2   3   2   2   2   2   2 
##  19  20  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35  36 
##   2   3   2   2   2   2   2   2   1   1   1   1   2   2   2   2   2   1 
##  37  38  39  40  41  42  43  44  45  46  47  48  49  50  51  52  53  54 
##   1   2   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1 
##  55  56  57  58  59  60  61  62  63  64  65  66  67  68  69  70  71  72 
##   2   1   1   1   2   1   1   1   2   1   1   1   2   1   1   1   2   1 
##  73  74  75  76  77  78  79  80  81  82  83  84  85  86  87  88  89  90 
##   1   2   1   3   2   1   2   1   2   2   1   1   2   1   2   2   2   2 
##  91  92  93  94  95  96  97  98  99 100 101 102 103 104 105 106 107 108 
##   2   1   1   2   2   2   1   3   2   2   2   2   3   3   3   3   2   2 
## 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 
##   2   2   1   2   3   2   2   3   2   2   3   2   2   3   3   2   2   3 
## 127 128 129 130 131 132 133 134 135 136 137 138 
##   3   3   3   3   2   3   3   3   3   3   3   3
compare <- data.frame(data[7], data[13], data[9], data[11],K$clust,data[6],data[12], data[8], data[10])
# Autumn temp avg, Winter temp avg, Spring temp avg, Summer temp avg,  CLUSTERING Autumn precip avg, Winter precip avg, Spring precip avg, Summer precip avg

# print first 10 lines
knitr::kable(head(head(compare, n=10), n=10), digits = 3)
SON.TAVG.NORMAL DJF.TAVG.NORMAL MAM.TAVG.NORMAL JJA.TAVG.NORMAL K.clust SON.PRCP.NORMAL DJF.PRCP.NORMAL MAM.PRCP.NORMAL JJA.PRCP.NORMAL
464 224 412 634 3 1233 923 1134 1499
460 223 418 631 3 1119 779 978 1342
453 218 410 624 3 1182 934 1122 1452
462 229 421 630 3 1115 902 1108 1332
467 239 419 640 3 1129 757 1033 1245
469 230 428 650 3 1342 1150 1127 1406
469 234 432 645 3 1098 809 986 1288
470 240 430 647 3 1404 907 1219 1377
476 243 434 649 3 1318 919 1185 1389
494 262 452 677 2 847 544 859 1133
1 (PURPLE) = high average temps & low precipitation
2 (BLUE) = mid average temps & mid precipitation
3 (RED) = low average temps & high precipitation


par(mfrow = c(2, 2))
boxplot(V1 ~ K$clust, data_elevation, main = "March-May Seasonal Precipitation", col = c("pink", "light green", "light blue"))
boxplot(V2 ~ K$clust, data_elevation, main = "June-Aug Seasonal Precipitation",col = c("pink", "light green", "light blue"))
boxplot(V3 ~ K$clust, data_elevation, main = "Sept-Nov Seasonal Precipitation",col = c("pink", "light green", "light blue"))
boxplot(V4 ~ K$clust, data_elevation, main = "Dec-Feb Seasonal Precipitation", col = c("pink", "light green", "light blue"))

par(mfrow = c(2, 2))
boxplot(V5 ~ K$clust, data_elevation, main = "Autumn Average Temperature", col = c("pink", "light green", "light blue"))
boxplot(V6 ~ K$clust, data_elevation, main = "Winter Average Temperature", col = c("pink", "light green", "light blue"))
boxplot(V7 ~ K$clust, data_elevation, main = "Spring Average Temperature", col = c("pink", "light green", "light blue"))
boxplot(V8 ~ K$clust, data_elevation, main = "Summer Average Temperature", col = c("pink", "light green", "light blue"))


Pittsburgh = 95 PITTSBURGH INTERNATIONAL AIRPORT PA US (PURPLE = Mid temp)
State College = 119 STATE COLLEGE PA US (RED = LOW TEMP)
Philadelpha = 89 PHILADELPHIA INTERNATIONAL AIRPORT PA US (PURPLE = Mid temp)


MANOVA of Regions

manova_region <- manova(cbind(V1,V2,V3,V4,V5,V6,V7,V8) ~ K$clust, data_elevation)
manova_region
## Call:
##    manova(cbind(V1, V2, V3, V4, V5, V6, V7, V8) ~ K$clust, data_elevation)
## 
## Terms:
##                   K$clust Residuals
## resp 1              30568   1976383
## resp 2           135230.1   36476.5
## resp 3            13696.7 1521321.8
## resp 4           164690.5   43141.1
## resp 5           474686.7 1706155.5
## resp 6           168846.0   42257.9
## resp 7            17844.9 1937509.5
## resp 8           175925.0   44805.5
## Deg. of Freedom         1       136
## 
## Residual standard errors: 120.5497 16.37711 105.7648 17.81051 112.0056 17.62725 119.3583 18.15082
## Estimated effects may be unbalanced
summary(manova_region, test = "Wilks")
##            Df  Wilks approx F num Df den Df    Pr(>F)    
## K$clust     1 0.1815   72.716      8    129 < 2.2e-16 ***
## Residuals 136                                            
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1