climate <- read.csv("NASAUnderstory.csv", header = TRUE)
str(climate)
## 'data.frame':    63 obs. of  32 variables:
##  $ PlotID           : int  2 3 12 14 15 16 18 19 20 21 ...
##  $ Overstory.Species: Factor w/ 2 levels "Aspen","Spruce": 1 2 1 1 1 2 1 1 2 2 ...
##  $ SPHA             : int  68 0 60 16 68 0 62 62 0 0 ...
##  $ BLIT             : int  14 6 1 14 7 14 7 5 5 16 ...
##  $ ASMA             : int  0 18 0 0 0 14 0 0 8 26 ...
##  $ MOSS             : int  3 2 0 72 30 3 3 0 14 5 ...
##  $ LEGR             : int  33 0 5 14 27 0 6 6 0 0 ...
##  $ CHCA             : int  5 0 9 1 8 0 12 28 0 0 ...
##  $ GRAS             : int  5 5 12 2 4 2 4 1 4 1 ...
##  $ SEDG             : int  0 0 1 0 0 0 0 0 6 0 ...
##  $ SMTR             : int  14 0 14 13 12 0 0 0 0 0 ...
##  $ PTAQ             : int  0 2 0 0 0 1 0 0 14 18 ...
##  $ COCA             : int  0 0 0 0 0 0 0 0 10 6 ...
##  $ VAAN             : int  4 0 0 5 5 0 0 0 3 10 ...
##  $ GAHI             : int  28 0 2 7 13 0 0 0 0 0 ...
##  $ ARNU             : int  0 7 0 0 0 2 0 0 0 10 ...
##  $ LYOB             : int  0 0 0 0 0 1 0 0 6 11 ...
##  $ PIMA             : int  1 0 3 2 3 0 13 15 0 0 ...
##  $ RUBU             : int  0 6 0 0 0 1 0 0 0 3 ...
##  $ VAOX             : int  6 0 4 1 3 0 5 5 0 0 ...
##  $ ACSP             : int  0 2 0 0 0 17 0 0 1 1 ...
##  $ COCO             : int  0 1 0 0 0 6 0 0 8 5 ...
##  $ ACRU             : int  0 2 0 0 0 10 0 0 1 5 ...
##  $ TRBO             : int  0 2 0 0 0 1 0 0 1 4 ...
##  $ MACA             : int  0 2 0 0 0 0 0 0 0 4 ...
##  $ CLBO             : int  0 1 0 0 0 3 0 0 3 2 ...
##  $ STRO             : int  0 1 0 0 0 4 0 0 0 0 ...
##  $ FUNG             : int  1 1 4 0 4 1 1 0 0 0 ...
##  $ DILO             : int  0 3 0 0 0 1 0 0 0 1 ...
##  $ ERIO             : int  0 0 3 0 0 0 11 10 0 0 ...
##  $ GATR             : int  0 4 0 0 0 2 0 0 0 0 ...
##  $ Labels           : Factor w/ 30 levels "","Bedstraw (Narrow Leaves)",..: 26 8 3 23 18 19 15 24 5 6 ...
summary(climate)
##      PlotID       Overstory.Species      SPHA            BLIT     
##  Min.   :  2.00   Aspen :30         Min.   : 0.00   Min.   : 0.0  
##  1st Qu.: 44.00   Spruce:33         1st Qu.: 0.00   1st Qu.: 8.0  
##  Median : 69.00                     Median : 2.00   Median :14.0  
##  Mean   : 63.49                     Mean   :28.65   Mean   :16.7  
##  3rd Qu.: 87.50                     3rd Qu.:61.00   3rd Qu.:24.0  
##  Max.   :105.00                     Max.   :86.00   Max.   :46.0  
##                                                                   
##       ASMA             MOSS             LEGR            CHCA       
##  Min.   : 0.000   Min.   : 0.000   Min.   : 0.00   Min.   : 0.000  
##  1st Qu.: 0.000   1st Qu.: 3.000   1st Qu.: 0.00   1st Qu.: 0.000  
##  Median : 2.000   Median : 5.000   Median : 0.00   Median : 0.000  
##  Mean   : 9.714   Mean   : 9.048   Mean   : 6.73   Mean   : 3.714  
##  3rd Qu.:16.500   3rd Qu.:10.000   3rd Qu.: 8.50   3rd Qu.: 5.000  
##  Max.   :48.000   Max.   :72.000   Max.   :36.00   Max.   :28.000  
##                                                                    
##       GRAS             SEDG             SMTR             PTAQ       
##  Min.   : 0.000   Min.   : 0.000   Min.   : 0.000   Min.   : 0.000  
##  1st Qu.: 1.000   1st Qu.: 0.000   1st Qu.: 0.000   1st Qu.: 0.000  
##  Median : 4.000   Median : 1.000   Median : 0.000   Median : 0.000  
##  Mean   : 3.714   Mean   : 3.619   Mean   : 2.444   Mean   : 2.317  
##  3rd Qu.: 5.000   3rd Qu.: 2.500   3rd Qu.: 3.500   3rd Qu.: 2.500  
##  Max.   :17.000   Max.   :34.000   Max.   :20.000   Max.   :20.000  
##                                                                     
##       COCA             VAAN             GAHI             ARNU       
##  Min.   : 0.000   Min.   : 0.000   Min.   : 0.000   Min.   : 0.000  
##  1st Qu.: 0.000   1st Qu.: 0.000   1st Qu.: 0.000   1st Qu.: 0.000  
##  Median : 0.000   Median : 1.000   Median : 0.000   Median : 0.000  
##  Mean   : 2.175   Mean   : 2.095   Mean   : 1.984   Mean   : 1.952  
##  3rd Qu.: 4.000   3rd Qu.: 4.000   3rd Qu.: 3.000   3rd Qu.: 4.000  
##  Max.   :17.000   Max.   :10.000   Max.   :28.000   Max.   :14.000  
##                                                                     
##       LYOB             PIMA             RUBU           VAOX      
##  Min.   : 0.000   Min.   : 0.000   Min.   :0.00   Min.   :0.000  
##  1st Qu.: 0.000   1st Qu.: 0.000   1st Qu.:0.00   1st Qu.:0.000  
##  Median : 0.000   Median : 0.000   Median :0.00   Median :0.000  
##  Mean   : 1.794   Mean   : 1.746   Mean   :1.46   Mean   :1.444  
##  3rd Qu.: 3.000   3rd Qu.: 2.500   3rd Qu.:3.00   3rd Qu.:3.000  
##  Max.   :11.000   Max.   :15.000   Max.   :8.00   Max.   :6.000  
##                                                                  
##       ACSP             COCO            ACRU             TRBO      
##  Min.   : 0.000   Min.   :0.000   Min.   : 0.000   Min.   :0.000  
##  1st Qu.: 0.000   1st Qu.:0.000   1st Qu.: 0.000   1st Qu.:0.000  
##  Median : 0.000   Median :0.000   Median : 0.000   Median :0.000  
##  Mean   : 1.413   Mean   :1.413   Mean   : 1.365   Mean   :1.238  
##  3rd Qu.: 1.000   3rd Qu.:2.000   3rd Qu.: 2.000   3rd Qu.:2.000  
##  Max.   :17.000   Max.   :9.000   Max.   :10.000   Max.   :5.000  
##                                                                   
##       MACA            CLBO            STRO            FUNG       
##  Min.   :0.000   Min.   :0.000   Min.   :0.000   Min.   :0.0000  
##  1st Qu.:0.000   1st Qu.:0.000   1st Qu.:0.000   1st Qu.:0.0000  
##  Median :0.000   Median :0.000   Median :0.000   Median :0.0000  
##  Mean   :1.206   Mean   :1.143   Mean   :1.048   Mean   :0.9683  
##  3rd Qu.:3.000   3rd Qu.:2.000   3rd Qu.:1.000   3rd Qu.:1.0000  
##  Max.   :6.000   Max.   :6.000   Max.   :5.000   Max.   :5.0000  
##                                                                  
##       DILO             ERIO              GATR       
##  Min.   :0.0000   Min.   : 0.0000   Min.   :0.0000  
##  1st Qu.:0.0000   1st Qu.: 0.0000   1st Qu.:0.0000  
##  Median :0.0000   Median : 0.0000   Median :0.0000  
##  Mean   :0.9048   Mean   : 0.8889   Mean   :0.8095  
##  3rd Qu.:1.0000   3rd Qu.: 0.0000   3rd Qu.:1.0000  
##  Max.   :8.0000   Max.   :24.0000   Max.   :4.0000  
##                                                     
##                       Labels  
##                          :34  
##  Bedstraw (Narrow Leaves): 1  
##  Big-leaved Aster        : 1  
##  Blue-bead Lily          : 1  
##  Bog False Solomon Seal  : 1  
##  Bracken Fern            : 1  
##  (Other)                 :24
climate.pca <- climate[3:13]
str(climate.pca)
## 'data.frame':    63 obs. of  11 variables:
##  $ SPHA: int  68 0 60 16 68 0 62 62 0 0 ...
##  $ BLIT: int  14 6 1 14 7 14 7 5 5 16 ...
##  $ ASMA: int  0 18 0 0 0 14 0 0 8 26 ...
##  $ MOSS: int  3 2 0 72 30 3 3 0 14 5 ...
##  $ LEGR: int  33 0 5 14 27 0 6 6 0 0 ...
##  $ CHCA: int  5 0 9 1 8 0 12 28 0 0 ...
##  $ GRAS: int  5 5 12 2 4 2 4 1 4 1 ...
##  $ SEDG: int  0 0 1 0 0 0 0 0 6 0 ...
##  $ SMTR: int  14 0 14 13 12 0 0 0 0 0 ...
##  $ PTAQ: int  0 2 0 0 0 1 0 0 14 18 ...
##  $ COCA: int  0 0 0 0 0 0 0 0 10 6 ...
library(corrplot) #correlation plot
library(FactoMineR) #additional PCA analysis
library(ggplot2) #support scatterplot
library(GPArotation) #supports rotation
library(psych) #PCA package
## 
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha
pca.df = as.data.frame(lapply(climate.pca , as.numeric))
str(pca.df)
## 'data.frame':    63 obs. of  11 variables:
##  $ SPHA: num  68 0 60 16 68 0 62 62 0 0 ...
##  $ BLIT: num  14 6 1 14 7 14 7 5 5 16 ...
##  $ ASMA: num  0 18 0 0 0 14 0 0 8 26 ...
##  $ MOSS: num  3 2 0 72 30 3 3 0 14 5 ...
##  $ LEGR: num  33 0 5 14 27 0 6 6 0 0 ...
##  $ CHCA: num  5 0 9 1 8 0 12 28 0 0 ...
##  $ GRAS: num  5 5 12 2 4 2 4 1 4 1 ...
##  $ SEDG: num  0 0 1 0 0 0 0 0 6 0 ...
##  $ SMTR: num  14 0 14 13 12 0 0 0 0 0 ...
##  $ PTAQ: num  0 2 0 0 0 1 0 0 14 18 ...
##  $ COCA: num  0 0 0 0 0 0 0 0 10 6 ...
climate.cor = cor(pca.df)
climate.cor
##            SPHA       BLIT        ASMA         MOSS        LEGR
## SPHA  1.0000000 -0.5864081 -0.72062426  0.124570398  0.70679705
## BLIT -0.5864081  1.0000000  0.29235971 -0.102323955 -0.44056347
## ASMA -0.7206243  0.2923597  1.00000000 -0.312032088 -0.50488086
## MOSS  0.1245704 -0.1023240 -0.31203209  1.000000000  0.30140755
## LEGR  0.7067971 -0.4405635 -0.50488086  0.301407554  1.00000000
## CHCA  0.6471222 -0.5468535 -0.43789936 -0.118197447  0.43763540
## GRAS  0.1459794 -0.2318856  0.02902742 -0.154977268  0.11862005
## SEDG  0.3948176 -0.3409134 -0.31500679  0.004050561  0.02280618
## SMTR  0.4948647 -0.2859442 -0.43409381  0.434961499  0.47030932
## PTAQ -0.4644532  0.0814870  0.54833695 -0.177306278 -0.32568366
## COCA -0.5727870  0.2859041  0.30578500 -0.093797842 -0.39140393
##             CHCA        GRAS         SEDG        SMTR        PTAQ
## SPHA  0.64712217  0.14597942  0.394817616  0.49486474 -0.46445318
## BLIT -0.54685346 -0.23188560 -0.340913352 -0.28594421  0.08148700
## ASMA -0.43789936  0.02902742 -0.315006793 -0.43409381  0.54833695
## MOSS -0.11819745 -0.15497727  0.004050561  0.43496150 -0.17730628
## LEGR  0.43763540  0.11862005  0.022806176  0.47030932 -0.32568366
## CHCA  1.00000000  0.09652412  0.387791822  0.08712632 -0.28139831
## GRAS  0.09652412  1.00000000 -0.270929392  0.05968249 -0.07908224
## SEDG  0.38779182 -0.27092939  1.000000000  0.20951973 -0.16144737
## SMTR  0.08712632  0.05968249  0.209519733  1.00000000 -0.27861933
## PTAQ -0.28139831 -0.07908224 -0.161447367 -0.27861933  1.00000000
## COCA -0.37854163 -0.08575924 -0.212353610 -0.32627244  0.38300912
##             COCA
## SPHA -0.57278704
## BLIT  0.28590411
## ASMA  0.30578500
## MOSS -0.09379784
## LEGR -0.39140393
## CHCA -0.37854163
## GRAS -0.08575924
## SEDG -0.21235361
## SMTR -0.32627244
## PTAQ  0.38300912
## COCA  1.00000000
corrplot(climate.cor, method="ellipse")

pca = principal(pca.df, nfactors=3, rotate="none")
pca
## Principal Components Analysis
## Call: principal(r = pca.df, nfactors = 3, rotate = "none")
## Standardized loadings (pattern matrix) based upon correlation matrix
##        PC1   PC2   PC3   h2   u2 com
## SPHA  0.93 -0.13  0.02 0.89 0.11 1.0
## BLIT -0.64  0.37 -0.07 0.54 0.46 1.6
## ASMA -0.77 -0.21  0.13 0.66 0.34 1.2
## MOSS  0.30  0.78  0.02 0.69 0.31 1.3
## LEGR  0.75  0.13  0.29 0.67 0.33 1.4
## CHCA  0.68 -0.53 -0.14 0.76 0.24 2.0
## GRAS  0.12 -0.34  0.80 0.77 0.23 1.4
## SEDG  0.44 -0.20 -0.73 0.76 0.24 1.8
## SMTR  0.61  0.47  0.11 0.60 0.40 2.0
## PTAQ -0.58 -0.19 -0.03 0.37 0.63 1.2
## COCA -0.63  0.07 -0.06 0.40 0.60 1.0
## 
##                        PC1  PC2  PC3
## SS loadings           4.27 1.51 1.32
## Proportion Var        0.39 0.14 0.12
## Cumulative Var        0.39 0.53 0.65
## Proportion Explained  0.60 0.21 0.19
## Cumulative Proportion 0.60 0.81 1.00
## 
## Mean item complexity =  1.5
## Test of the hypothesis that 3 components are sufficient.
## 
## The root mean square of the residuals (RMSR) is  0.09 
##  with the empirical chi square  54.78  with prob <  0.00053 
## 
## Fit based upon off diagonal values = 0.94
plot(pca$values, type="b", ylab="Eigenvalues", xlab="Component")

pca.rotate = principal(pca.df, nfactors=3, rotate = "varimax")
pca.rotate
## Principal Components Analysis
## Call: principal(r = pca.df, nfactors = 3, rotate = "varimax")
## Standardized loadings (pattern matrix) based upon correlation matrix
##        RC1   RC2   RC3   h2   u2 com
## SPHA  0.83  0.45 -0.01 0.89 0.11 1.5
## BLIT -0.72 -0.10 -0.09 0.54 0.46 1.1
## ASMA -0.50 -0.61  0.19 0.66 0.34 2.2
## MOSS -0.22  0.79 -0.10 0.69 0.31 1.2
## LEGR  0.51  0.59  0.23 0.67 0.33 2.3
## CHCA  0.86 -0.04 -0.09 0.76 0.24 1.0
## GRAS  0.26 -0.08  0.83 0.77 0.23 1.2
## SEDG  0.50 -0.01 -0.71 0.76 0.24 1.8
## SMTR  0.20  0.75  0.01 0.60 0.40 1.1
## PTAQ -0.35 -0.50  0.03 0.37 0.63 1.8
## COCA -0.54 -0.32 -0.04 0.40 0.60 1.6
## 
##                        RC1  RC2  RC3
## SS loadings           3.30 2.48 1.33
## Proportion Var        0.30 0.23 0.12
## Cumulative Var        0.30 0.52 0.65
## Proportion Explained  0.46 0.35 0.19
## Cumulative Proportion 0.46 0.81 1.00
## 
## Mean item complexity =  1.5
## Test of the hypothesis that 3 components are sufficient.
## 
## The root mean square of the residuals (RMSR) is  0.09 
##  with the empirical chi square  54.78  with prob <  0.00053 
## 
## Fit based upon off diagonal values = 0.94
pca.scores = pca.rotate$scores
pca.scores = as.data.frame(pca.scores)
pca.scores
##              RC1         RC2         RC3
## 1   0.6746270655  1.52571290  1.08125879
## 2  -0.0005187963 -0.67710138  0.58751345
## 3   1.2505676033  0.44738160  1.80080885
## 4  -1.5871923206  3.75027363 -0.16914394
## 5   0.3783449500  2.14098082  0.65842418
## 6  -0.2912059263 -0.40191019 -0.05841779
## 7   1.1369877401 -0.35179497  0.25595647
## 8   1.9507719628 -0.89503434 -0.40385762
## 9  -0.5899014698 -0.80529802 -0.30836841
## 10 -0.8663244944 -1.26773374 -0.21470331
## 11 -0.8370119007 -1.13165113  0.33100881
## 12  1.2554953666  0.71307103  1.33000712
## 13 -0.0582159826  2.14933954  0.82217709
## 14  0.2221669566  0.55861023  0.14879868
## 15 -0.7088492746  1.79711070 -0.79649374
## 16 -0.4007935838  2.05922844 -0.12693704
## 17  0.1970853748  0.46148099 -1.22830671
## 18  1.2623408447  0.98803856 -2.38037273
## 19  0.3191284524  1.04322429  1.29745574
## 20  0.6283101258  0.08729548 -0.55051953
## 21 -0.6753065576  1.68566995 -0.59700857
## 22  1.6691854324  0.22258563  0.26500776
## 23  1.9531761336  0.12515625  3.18313043
## 24  0.9257286632  0.46899267  0.76965843
## 25 -0.6113630273  1.13742305 -0.76805936
## 26  0.7853430147  0.74153031 -0.26351473
## 27  1.9332969999 -0.48405144 -1.32716327
## 28  2.1668620428 -0.65321362  2.12745775
## 29  1.2330934295 -0.35489338  0.73512618
## 30  2.5117466563 -1.15825535 -3.00766316
## 31  0.7391614691  0.02437610 -1.01714445
## 32 -1.0992510155 -0.59979284 -0.34533100
## 33 -0.6703757886 -0.54553667 -0.30361429
## 34 -0.9414885522 -0.21692934 -0.43013890
## 35 -0.8974621120 -0.36225594  0.17879402
## 36 -0.5321911654 -0.40457463  0.26709428
## 37 -0.4923353417 -0.36807193  0.47963932
## 38 -1.2050412414  0.17253275 -0.55241275
## 39 -0.7112278489 -0.66773767 -0.07696602
## 40 -0.0005187963 -0.67710138  0.58751345
## 41 -0.7630744286 -0.44970109  0.33256080
## 42 -0.6064997041 -0.90348074  0.74707358
## 43 -0.5006733806 -0.45345396  0.40721421
## 44 -0.7351537859 -1.21643199 -0.28078065
## 45 -0.8335117101 -0.59151860  0.32848724
## 46 -0.6408058601 -0.51407868  0.30369686
## 47 -0.6255011437 -0.05688314 -0.06960956
## 48 -1.1023861430 -0.13679733 -0.41148168
## 49 -1.1452247129  0.31335124 -0.60201579
## 50 -0.6175372206 -0.29621244  0.26012667
## 51 -0.7580922954 -0.31509523  0.26227034
## 52 -0.5931202721 -0.51613988  0.16898898
## 53 -0.3699647673 -1.11760080  0.52827934
## 54 -0.3358055646 -0.83530333  0.93557378
## 55 -0.9972653950 -0.70206891  0.03211676
## 56 -0.5266589205 -1.62765331  0.17983169
## 57 -0.4235953093 -1.05265948  0.36395095
## 58 -1.0962057864 -0.69135822  0.22011849
## 59  0.4042507987  0.13527944 -2.01746563
## 60 -0.6339547011  0.40752078 -0.79628120
## 61  0.6798212632  0.18516287  0.26253607
## 62  1.7866102311 -0.46008745 -2.28902015
## 63  0.4175037205  0.61813327 -0.84686458
climate$SPHA = as.numeric(climate$SPHA)
climate$PC1 = pca.scores$RC1
climate$PC2 = pca.scores$RC2
climate$PC3= pca.scores$RC3

#Pincipal Components Regression
climate.lm = lm(SPHA~PC1+PC2+PC3, data=climate)
summary(climate.lm)
## 
## Call:
## lm(formula = SPHA ~ PC1 + PC2 + PC3, data = climate)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -24.6240  -6.3251  -0.2731   7.5362  29.8318 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   28.651      1.359  21.075  < 2e-16 ***
## PC1           25.699      1.370  18.754  < 2e-16 ***
## PC2           14.056      1.370  10.257 9.85e-15 ***
## PC3           -0.295      1.370  -0.215     0.83    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 10.79 on 59 degrees of freedom
## Multiple R-squared:  0.8856, Adjusted R-squared:  0.8798 
## F-statistic: 152.3 on 3 and 59 DF,  p-value: < 2.2e-16
climate.lm2 = lm(SPHA~PC1+PC2, data=climate)
summary(climate.lm2)
## 
## Call:
## lm(formula = SPHA ~ PC1 + PC2, data = climate)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -24.5741  -6.6156  -0.4673   7.5511  29.7879 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   28.651      1.349   21.25  < 2e-16 ***
## PC1           25.699      1.359   18.91  < 2e-16 ***
## PC2           14.056      1.359   10.34 5.95e-15 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 10.7 on 60 degrees of freedom
## Multiple R-squared:  0.8856, Adjusted R-squared:  0.8817 
## F-statistic: 232.1 on 2 and 60 DF,  p-value: < 2.2e-16
plot(climate.lm2$fitted.values, 
     climate$SPHA, main="Predicted versus Actual",xlab="Predicted",ylab="Actual")

climate$pred = climate.lm2$fitted.values
climate=climate[order(-climate$SPHA),]
climate.best = climate[1:15,]



p2 = ggplot(climate, aes(x=PC1, y=PC2, label = Labels))
p2 + geom_point() +
geom_text(size=3, hjust=.2, vjust=-0.5, angle=0) +
xlim(-3,3) + ylim(-3,3)
## Warning: Removed 1 rows containing missing values (geom_point).
## Warning: Removed 1 rows containing missing values (geom_text).