PA Climate Analysis

# Required libraries
library(MASS)
library(car)
library(lattice)
library(cluster)
library(fpc)
library(GPArotation)
library(FactoMineR)
#library(psych)
#library(nFactors)

Preview PA Climate Data

data<-read.csv("C:\\Users\\Jenn\\Documents\\R\\Multivariate\\paclimate.csv",header = TRUE)
climate <- data[6:13]

# Should we standardize our data?
variables<- var(climate)
# standardize data
scaled_climate <- scale(climate)

# Preview first 10 obs
knitr::kable(head(climate, n=10), digits = 3)

SON.PRCP.NORMAL	SON.TAVG.NORMAL	MAM.PRCP.NORMAL	MAM.TAVG.NORMAL	JJA.PRCP.NORMAL	JJA.TAVG.NORMAL	DJF.PRCP.NORMAL	DJF.TAVG.NORMAL
1233	464	1134	412	1499	634	923	224
1119	460	978	418	1342	631	779	223
1182	453	1122	410	1452	624	934	218
1115	462	1108	421	1332	630	902	229
1129	467	1033	419	1245	640	757	239
1342	469	1127	428	1406	650	1150	230
1098	469	986	432	1288	645	809	234
1404	470	1219	430	1377	647	907	240
1318	476	1185	434	1389	649	919	243
847	494	859	452	1133	677	544	262

Scatterplot Matrix

# Scatterplot Matrix
par(mfrow = c(1, 1))
scatterplotMatrix(climate)

Principal Component Analysis

# Principal Component Analysis
climate.pca <- prcomp(scaled_climate)
summary(climate.pca)

## Importance of components:
##                           PC1    PC2     PC3     PC4     PC5     PC6
## Standard deviation     2.0728 1.6918 0.66277 0.53339 0.30038 0.11114
## Proportion of Variance 0.5371 0.3578 0.05491 0.03556 0.01128 0.00154
## Cumulative Proportion  0.5371 0.8948 0.94974 0.98530 0.99658 0.99812
##                            PC7     PC8
## Standard deviation     0.10430 0.06424
## Proportion of Variance 0.00136 0.00052
## Cumulative Proportion  0.99948 1.00000

# Screeplot
screeplot(climate.pca, type="lines")

# Principal Component Factor Estimation
pc<-prcomp(climate,center=TRUE, scale=TRUE) #correlation PCA
pc$sdev

## [1] 2.07282889 1.69175128 0.66277180 0.53338878 0.30038343 0.11114463
## [7] 0.10429955 0.06423563

sum((pc$sdev)^2 > mean((pc$sdev)^2)) # two factors should be used (Kaiser's rule)

## [1] 2

Factor Loadings & Rotations

# no rotation
pcfactorloadings <- pc$rotation[,1:2]
print(pc$rotation[,1:2],digits = 2, cutoff=.3)

##                    PC1   PC2
## SON.PRCP.NORMAL -0.104 0.490
## SON.TAVG.NORMAL  0.476 0.052
## MAM.PRCP.NORMAL  0.055 0.535
## MAM.TAVG.NORMAL  0.478 0.051
## JJA.PRCP.NORMAL -0.272 0.394
## JJA.TAVG.NORMAL  0.475 0.040
## DJF.PRCP.NORMAL  0.060 0.555
## DJF.TAVG.NORMAL  0.477 0.057

load <- pcfactorloadings[,1:2]
plot(load,type="n",main="PCF,no rotation") # set up plot 
text(load,labels=names(climate),cex=.7) # add variable names

# varimax rotation
rot<-varimax(pcfactorloadings, normalize = FALSE)
print(rot$loadings,digits = 2, cutoff=.3)

## 
## Loadings:
##                 PC1   PC2  
## SON.PRCP.NORMAL        0.50
## SON.TAVG.NORMAL  0.48      
## MAM.PRCP.NORMAL        0.53
## MAM.TAVG.NORMAL  0.48      
## JJA.PRCP.NORMAL        0.41
## JJA.TAVG.NORMAL  0.48      
## DJF.PRCP.NORMAL        0.55
## DJF.TAVG.NORMAL  0.48      
## 
##                 PC1  PC2
## SS loadings    1.00 1.00
## Proportion Var 0.12 0.12
## Cumulative Var 0.12 0.25

load <- rot$loadings[,1:2] 
plot(load,type="n",main="PCF,varimax") # set up plot 
text(load,labels=names(climate),cex=.7) # add variable names

Boxplots for Seasonal Precipitation & Temperature

data_elevation <- read.csv("C:\\Users\\Jenn\\Documents\\R\\Multivariate\\paclimateHIGHMIDLOW.csv", header = TRUE)

# MANOVA
# 3 elevations, n = 139.
attach(data_elevation) 


# Boxplots
par(mfrow = c(2, 2))
boxplot(V1 ~ group, data_elevation, main = "March-May Seasonal Precipitation",col = c("pink", " purple", "light blue"))
boxplot(V2 ~ group, data_elevation, main = "June-Aug Seasonal Precipitation",col = c("pink", " purple", "light blue"))
boxplot(V3 ~ group, data_elevation, main = "Sept-Nov Seasonal Precipitation",col = c("pink", " purple", "light blue"))
boxplot(V4 ~ group, data_elevation, main = "Dec-Feb Seasonal Precipitation",col = c("pink", " purple", "light blue"))

par(mfrow = c(2, 2))
boxplot(V5 ~ group, data_elevation, main = "Autumn Average Temperature",col = c("pink", " purple", "light blue"))
boxplot(V6 ~ group, data_elevation, main = "Winter Average Temperature",col = c("pink", " purple", "light blue"))
boxplot(V7 ~ group, data_elevation, main = "Spring Average Temperature",col = c("pink", " purple", "light blue"))
boxplot(V8 ~ group, data_elevation, main = "Summer Average Temperature",col = c("pink", " purple", "light blue"))

MANOVA & Univariate ANOVA Tables

# MANOVA
manova <- manova(cbind(V1,V2,V3,V4,V5,V6,V7,V8) ~ group, data_elevation)
manova

## Call:
##    manova(cbind(V1, V2, V3, V4, V5, V6, V7, V8) ~ group, data_elevation)
## 
## Terms:
##                     group Residuals
## resp 1            46682.6 1960267.8
## resp 2             4716.2  166990.5
## resp 3            95662.7 1439355.8
## resp 4             7131.2  200700.4
## resp 5            55015.8 2125826.3
## resp 6             5308.2  205795.8
## resp 7            93387.4 1861967.1
## resp 8             6566.0  214164.5
## Deg. of Freedom         2       135
## 
## Residual standard errors: 120.501 35.17053 103.2565 38.55736 125.4865 39.04374 117.4408 39.82968
## Estimated effects may be unbalanced

# Univariate ANOVA Tables
summary(manova, test = "Wilks")

##            Df   Wilks approx F num Df den Df  Pr(>F)  
## group       2 0.83453   1.5145     16    256 0.09448 .
## Residuals 135                                         
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

summary(manova, test = "Roy")

##            Df     Roy approx F num Df den Df  Pr(>F)  
## group       2 0.14513   2.3402      8    129 0.02214 *
## Residuals 135                                         
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

summary(manova, test = "Pillai")

##            Df  Pillai approx F num Df den Df  Pr(>F)  
## group       2 0.17109   1.5084     16    258 0.09651 .
## Residuals 135                                         
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

summary(manova, test = "Hotelling-Lawley")

##            Df Hotelling-Lawley approx F num Df den Df  Pr(>F)  
## group       2          0.19154   1.5203     16    254 0.09257 .
## Residuals 135                                                  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

summary.aov(manova)        # univariate ANOVA tables

##  Response V1 :
##              Df  Sum Sq Mean Sq F value Pr(>F)
## group         2   46683   23341  1.6075 0.2042
## Residuals   135 1960268   14520               
## 
##  Response V2 :
##              Df Sum Sq Mean Sq F value Pr(>F)
## group         2   4716  2358.1  1.9064 0.1526
## Residuals   135 166990  1237.0               
## 
##  Response V3 :
##              Df  Sum Sq Mean Sq F value  Pr(>F)  
## group         2   95663   47831  4.4862 0.01299 *
## Residuals   135 1439356   10662                  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##  Response V4 :
##              Df Sum Sq Mean Sq F value  Pr(>F)  
## group         2   7131  3565.6  2.3984 0.09473 .
## Residuals   135 200700  1486.7                  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##  Response V5 :
##              Df  Sum Sq Mean Sq F value Pr(>F)
## group         2   55016   27508  1.7469 0.1782
## Residuals   135 2125826   15747               
## 
##  Response V6 :
##              Df Sum Sq Mean Sq F value Pr(>F)
## group         2   5308  2654.1   1.741 0.1792
## Residuals   135 205796  1524.4               
## 
##  Response V7 :
##              Df  Sum Sq Mean Sq F value  Pr(>F)  
## group         2   93387   46694  3.3855 0.03676 *
## Residuals   135 1861967   13792                  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##  Response V8 :
##              Df Sum Sq Mean Sq F value Pr(>F)
## group         2   6566  3283.0  2.0695 0.1302
## Residuals   135 214164  1586.4

Cluster Plot by Low, Mid, & High Elevation

# Clustering 
elevationHIGH <- subset(data_elevation,select = c(V1:V8), group == "HIGH")
elevationMID <- subset(data_elevation,select = c(V1:V8), group == "MID")
elevationLOW <- subset(data_elevation,select = c(V1:V8), group == "LOW")

cluster <- c(data_elevation[4], nrow=1, byrow=TRUE)
par(mfrow = c(1, 1))
clusplot(climate, cluster$group, color=TRUE, shade=TRUE, labels=2, lines=0)

# Does not appear any distinct clusters. BAD

Cluster Analysis

# Gap Statistics     
gap <- clusGap(data_elevation[7:14], FUN = kmeans, K.max = 8)
plot(gap)    # there appears to be 3 clusters

# Euclidean Distance 
scaled_climate <- scale(climate)

# dissimilarity:  2-norm 
d_Euclidean <- dist(scaled_climate, method="euclidean")
# K-Means Cluster Analysis  (based on Euclidean distance)
KmeansE <- kmeans(d_Euclidean, 3) # k=3 clusters 
KmeansE$cluster

##   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18 
##   3   3   3   3   3   3   3   3   3   2   3   2   3   2   2   2   2   2 
##  19  20  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35  36 
##   2   3   2   2   2   2   2   2   1   1   1   1   2   2   2   2   2   1 
##  37  38  39  40  41  42  43  44  45  46  47  48  49  50  51  52  53  54 
##   1   2   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1 
##  55  56  57  58  59  60  61  62  63  64  65  66  67  68  69  70  71  72 
##   2   1   1   1   2   1   1   1   2   1   1   1   2   1   1   1   2   1 
##  73  74  75  76  77  78  79  80  81  82  83  84  85  86  87  88  89  90 
##   1   2   1   3   2   1   2   1   2   2   1   1   2   1   2   2   2   2 
##  91  92  93  94  95  96  97  98  99 100 101 102 103 104 105 106 107 108 
##   2   1   1   2   2   2   1   3   2   2   2   2   3   3   3   3   2   2 
## 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 
##   2   2   1   2   3   2   2   3   2   2   3   2   2   3   3   2   2   3 
## 127 128 129 130 131 132 133 134 135 136 137 138 
##   3   3   3   3   2   3   3   3   3   3   3   3

# dissimilarity:  1-norm 
d_Manhattan <- dist(scaled_climate, method="manhattan")
# K-Means Cluster Analysis  (based on Manhattan distance)
KmeansM <- kmeans(d_Manhattan, 3) # k=3 clusters 
KmeansM$cluster

##   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18 
##   1   1   1   1   1   1   1   1   1   1   1   3   1   1   3   3   3   3 
##  19  20  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35  36 
##   3   1   3   3   3   3   3   3   2   2   2   2   3   3   3   3   3   2 
##  37  38  39  40  41  42  43  44  45  46  47  48  49  50  51  52  53  54 
##   2   3   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2 
##  55  56  57  58  59  60  61  62  63  64  65  66  67  68  69  70  71  72 
##   3   2   2   2   3   2   2   2   3   2   2   2   3   2   2   2   3   2 
##  73  74  75  76  77  78  79  80  81  82  83  84  85  86  87  88  89  90 
##   2   3   2   1   3   2   3   2   3   3   2   2   3   2   3   3   3   3 
##  91  92  93  94  95  96  97  98  99 100 101 102 103 104 105 106 107 108 
##   3   2   2   3   3   3   2   1   3   3   1   3   3   1   1   1   3   3 
## 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 
##   3   3   2   3   1   3   3   1   3   3   1   3   3   1   1   3   3   1 
## 127 128 129 130 131 132 133 134 135 136 137 138 
##   1   1   1   1   3   1   1   1   1   1   1   1

# cluster plot
par(mfrow = c(1, 1))
clusplot(scaled_climate, KmeansE$cluster, color=TRUE, shade=TRUE, labels=2, lines=0)

clusplot(scaled_climate, KmeansM$cluster, color=TRUE, shade=TRUE, labels=2, lines=0)

Comparing Temperatures & Precipation

# comparison - TEMPERATURES
compare_temp <- data.frame(data_elevation[8], data_elevation[14], data_elevation[10], data_elevation[12],KmeansE$cluster)
# Autumn temp avg, Winter temp avg, Spring temp avg, Summer temp avg
head(compare_temp, n=10)

##     V2  V8  V4  V6 KmeansE.cluster
## 1  464 224 412 634               3
## 2  460 223 418 631               3
## 3  453 218 410 624               3
## 4  462 229 421 630               3
## 5  467 239 419 640               3
## 6  469 230 428 650               3
## 7  469 234 432 645               3
## 8  470 240 430 647               3
## 9  476 243 434 649               3
## 10 494 262 452 677               2

# 3 = low average temps (RED),  #2 mid average temps (BLUE),  #1 = high average temps  (PURPLE)


# comparison - Precipitation
compare_precip <- data.frame(data_elevation[7], data_elevation[13], data_elevation[9], data_elevation[11],KmeansE$cluster)
# Autumn precip avg, Winter precip avg, Spring precip avg, Summer precip avg
head(compare_precip, n=10)  # print first 10 lines

##      V1   V7   V3   V5 KmeansE.cluster
## 1  1233  923 1134 1499               3
## 2  1119  779  978 1342               3
## 3  1182  934 1122 1452               3
## 4  1115  902 1108 1332               3
## 5  1129  757 1033 1245               3
## 6  1342 1150 1127 1406               3
## 7  1098  809  986 1288               3
## 8  1404  907 1219 1377               3
## 9  1318  919 1185 1389               3
## 10  847  544  859 1133               2

# Preciptitation does not give much information to analysis
# 3 = high precip (RED),  #2 mid precip (BLUE),  #1 = low precip  (PURPLE)

# It appears this is a more appropriate clustering scheme

K-Means Clustering (Euclidean distance)

K <- kmeans(d_Euclidean, 3) # k=3 clusters 
K$clust

##   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18 
##   3   3   3   3   3   3   3   3   3   2   3   2   3   2   2   2   2   2 
##  19  20  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35  36 
##   2   3   2   2   2   2   2   2   1   1   1   1   2   2   2   2   2   1 
##  37  38  39  40  41  42  43  44  45  46  47  48  49  50  51  52  53  54 
##   1   2   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1 
##  55  56  57  58  59  60  61  62  63  64  65  66  67  68  69  70  71  72 
##   2   1   1   1   2   1   1   1   2   1   1   1   2   1   1   1   2   1 
##  73  74  75  76  77  78  79  80  81  82  83  84  85  86  87  88  89  90 
##   1   2   1   3   2   1   2   1   2   2   1   1   2   1   2   2   2   2 
##  91  92  93  94  95  96  97  98  99 100 101 102 103 104 105 106 107 108 
##   2   1   1   2   2   2   1   3   2   2   2   2   3   3   3   3   2   2 
## 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 
##   2   2   1   2   3   2   2   3   2   2   3   2   2   3   3   2   2   3 
## 127 128 129 130 131 132 133 134 135 136 137 138 
##   3   3   3   3   2   3   3   3   3   3   3   3

compare <- data.frame(data[7], data[13], data[9], data[11],K$clust,data[6],data[12], data[8], data[10])
# Autumn temp avg, Winter temp avg, Spring temp avg, Summer temp avg,  CLUSTERING Autumn precip avg, Winter precip avg, Spring precip avg, Summer precip avg

# print first 10 lines
knitr::kable(head(head(compare, n=10), n=10), digits = 3)

SON.TAVG.NORMAL	DJF.TAVG.NORMAL	MAM.TAVG.NORMAL	JJA.TAVG.NORMAL	K.clust	SON.PRCP.NORMAL	DJF.PRCP.NORMAL	MAM.PRCP.NORMAL	JJA.PRCP.NORMAL
464	224	412	634	3	1233	923	1134	1499
460	223	418	631	3	1119	779	978	1342
453	218	410	624	3	1182	934	1122	1452
462	229	421	630	3	1115	902	1108	1332
467	239	419	640	3	1129	757	1033	1245
469	230	428	650	3	1342	1150	1127	1406
469	234	432	645	3	1098	809	986	1288
470	240	430	647	3	1404	907	1219	1377
476	243	434	649	3	1318	919	1185	1389
494	262	452	677	2	847	544	859	1133

1 (PURPLE) = high average temps & low precipitation

2 (BLUE) = mid average temps & mid precipitation

3 (RED) = low average temps & high precipitation

par(mfrow = c(2, 2))
boxplot(V1 ~ K$clust, data_elevation, main = "March-May Seasonal Precipitation", col = c("pink", "light green", "light blue"))
boxplot(V2 ~ K$clust, data_elevation, main = "June-Aug Seasonal Precipitation",col = c("pink", "light green", "light blue"))
boxplot(V3 ~ K$clust, data_elevation, main = "Sept-Nov Seasonal Precipitation",col = c("pink", "light green", "light blue"))
boxplot(V4 ~ K$clust, data_elevation, main = "Dec-Feb Seasonal Precipitation", col = c("pink", "light green", "light blue"))

par(mfrow = c(2, 2))
boxplot(V5 ~ K$clust, data_elevation, main = "Autumn Average Temperature", col = c("pink", "light green", "light blue"))
boxplot(V6 ~ K$clust, data_elevation, main = "Winter Average Temperature", col = c("pink", "light green", "light blue"))
boxplot(V7 ~ K$clust, data_elevation, main = "Spring Average Temperature", col = c("pink", "light green", "light blue"))
boxplot(V8 ~ K$clust, data_elevation, main = "Summer Average Temperature", col = c("pink", "light green", "light blue"))

Pittsburgh = 95 PITTSBURGH INTERNATIONAL AIRPORT PA US (PURPLE = Mid temp)

State College = 119 STATE COLLEGE PA US (RED = LOW TEMP)

Philadelpha = 89 PHILADELPHIA INTERNATIONAL AIRPORT PA US (PURPLE = Mid temp)

MANOVA of Regions

manova_region <- manova(cbind(V1,V2,V3,V4,V5,V6,V7,V8) ~ K$clust, data_elevation)
manova_region

## Call:
##    manova(cbind(V1, V2, V3, V4, V5, V6, V7, V8) ~ K$clust, data_elevation)
## 
## Terms:
##                   K$clust Residuals
## resp 1              30568   1976383
## resp 2           135230.1   36476.5
## resp 3            13696.7 1521321.8
## resp 4           164690.5   43141.1
## resp 5           474686.7 1706155.5
## resp 6           168846.0   42257.9
## resp 7            17844.9 1937509.5
## resp 8           175925.0   44805.5
## Deg. of Freedom         1       136
## 
## Residual standard errors: 120.5497 16.37711 105.7648 17.81051 112.0056 17.62725 119.3583 18.15082
## Estimated effects may be unbalanced

summary(manova_region, test = "Wilks")

##            Df  Wilks approx F num Df den Df    Pr(>F)    
## K$clust     1 0.1815   72.716      8    129 < 2.2e-16 ***
## Residuals 136                                            
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

PA Climate Analysis

Jenn Murphy

December 27, 2015

Preview PA Climate Data

Scatterplot Matrix

Principal Component Analysis

Factor Loadings & Rotations

Boxplots for Seasonal Precipitation & Temperature

MANOVA & Univariate ANOVA Tables

Cluster Plot by Low, Mid, & High Elevation

Cluster Analysis

Comparing Temperatures & Precipation

K-Means Clustering (Euclidean distance)

1 (PURPLE) = high average temps & low precipitation

2 (BLUE) = mid average temps & mid precipitation

3 (RED) = low average temps & high precipitation

Pittsburgh = 95 PITTSBURGH INTERNATIONAL AIRPORT PA US (PURPLE = Mid temp)

State College = 119 STATE COLLEGE PA US (RED = LOW TEMP)

Philadelpha = 89 PHILADELPHIA INTERNATIONAL AIRPORT PA US (PURPLE = Mid temp)

MANOVA of Regions