Import libraries we will use for this lab
library(cluster)
library(fpc)
library(fields)
library(maps)
library(spam)
Set working directory and read boston6k data
setwd("/Users/annapeterson/Desktop/Classes/GEOG6000/lab07")
boston = read.csv("/Users/annapeterson/Desktop/Classes/GEOG6000/lab07/boston6k.csv")
Prep the data for k-means cluster analysis by extracting and then scaling data
boston_use = boston[, seq(9,21)]
boston_scale = scale(boston_use)
Create blank vectors (ch_out) then use a for loop to iterate kmeans. The loop stores our kmeans so we are able to test different choices of k
library(fpc)
ch_out = rep(NA,20)
for (i in 2:20) {
boston_k = kmeans(boston_scale, centers = i, nstart = 50)
ch_out[i] = calinhara(boston_scale, boston_k$cluster)
}
Now we plot to see which cluster is best. This will be the highest
peak. I had to use dev.new to open plot because the usual plot window
was squishing the plot but after clearing the environment it worked
Looks like 2 clusters (k = 2) is the highest peak and our
answer. In my opinion this is the best solution because there is such a
drastic difference between the peak at 2 and the rest of the
clusters.
Re-running kmeans using 2
k = 2
boston_use_k = kmeans(boston_scale, k, nstart = 50, iter.max = 20)
table(boston_use_k$cluster)
##
## 1 2
## 177 329
Using the aggregate function to provide a table showing the median variables used in clustering.
boston_centers = aggregate(boston_use, list(boston_use_k$cluster), mean)
boston_centers
## Group.1 CRIM ZN INDUS CHAS NOX RM AGE
## 1 1 9.8447302 0.0000 19.039718 0.06779661 0.6805028 5.967181 91.31808
## 2 2 0.2611723 17.4772 6.885046 0.06990881 0.4870112 6.455422 56.33921
## DIS RAD TAX PTRATIO B LSTAT
## 1 2.007242 18.988701 605.8588 19.60452 301.3317 18.572768
## 2 4.756868 4.471125 301.9179 17.83739 386.4479 9.468298
Cluster 2 has higher crime, industry, age, rad (i’m not sure what
this is because I don’t use census data), property taxes, and lstat.
Whereas Cluster 1 has higher zn, rm, dis, and b. The other columns
didn’t have as notable of a difference (e.g. chas).
Report mean corrected house value per cluster
tapply(boston$CMEDV,boston_use_k$cluster, mean)
## 1 2
## 16.52712 25.75775
Cluster 1 has a higher median house price.
Using anova to test the significant difference between clusters
fit = aov(boston_use_k$cluster ~ CMEDV, data = boston)
summary(fit)
## Df Sum Sq Mean Sq F value Pr(>F)
## CMEDV 1 26.50 26.504 150.8 <2e-16 ***
## Residuals 504 88.58 0.176
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
The f-value is 150.8 and the p-value is <2 x
10-16.
Read in data
wna = read.csv("/Users/annapeterson/Desktop/Classes/GEOG6000/lab07/wnaclim2.csv")
Same approach as exercise 1
wna_use = wna[, seq(3, 26)]
wna_scale = scale(wna_use)
SVD approach and make a biplot and screeplot
wna_pca = prcomp(wna_use, scale = TRUE)
summary(wna_pca)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7
## Standard deviation 3.4612 2.8065 1.5846 0.84733 0.63986 0.51316 0.29261
## Proportion of Variance 0.4992 0.3282 0.1046 0.02992 0.01706 0.01097 0.00357
## Cumulative Proportion 0.4992 0.8274 0.9320 0.96190 0.97896 0.98993 0.99350
## PC8 PC9 PC10 PC11 PC12 PC13 PC14
## Standard deviation 0.20707 0.17437 0.15014 0.13534 0.10798 0.09353 0.08322
## Proportion of Variance 0.00179 0.00127 0.00094 0.00076 0.00049 0.00036 0.00029
## Cumulative Proportion 0.99528 0.99655 0.99749 0.99825 0.99874 0.99910 0.99939
## PC15 PC16 PC17 PC18 PC19 PC20 PC21
## Standard deviation 0.06116 0.05676 0.04584 0.04161 0.03439 0.02946 0.02562
## Proportion of Variance 0.00016 0.00013 0.00009 0.00007 0.00005 0.00004 0.00003
## Cumulative Proportion 0.99955 0.99968 0.99977 0.99984 0.99989 0.99993 0.99995
## PC22 PC23 PC24
## Standard deviation 0.02411 0.01802 0.01392
## Proportion of Variance 0.00002 0.00001 0.00001
## Cumulative Proportion 0.99998 0.99999 1.00000
Total variance and percentage
variance = summary(wna_pca)
axes_1 = variance$importance[2,1]
axes_2 = variance$importance[2,2]
total = sum(variance$importance[2,])
percent = ((axes_1 + axes_2)/total) * 100
axes_2
## [1] 0.32819
percent
## [1] 82.73517
The total variance in the second PCA is 0.32819 and the total percentage of variance explained by axes 1 and 2 is 82.73%.
Examining the loadings of the variables on the first two axes (found in rotation)
loadings = wna_pca$rotation
loadings
## PC1 PC2 PC3 PC4 PC5
## tjan 0.27154635 -0.07007925 0.080386417 -0.1634473814 0.29901716
## tfeb 0.27524689 -0.07830351 0.051384208 -0.0792929685 0.26587174
## tmar 0.27484658 -0.09367701 -0.001067899 0.0006366441 0.19086484
## tapr 0.26473010 -0.12257641 -0.076298000 0.1052955242 0.03981628
## tmay 0.23877755 -0.16889072 -0.127271902 0.1251598792 -0.21848109
## tjun 0.20220435 -0.21701738 -0.139841033 0.0811805985 -0.44268300
## tjul 0.20390908 -0.22851811 -0.090041235 0.0981226254 -0.35337585
## taug 0.22863878 -0.20271341 -0.057726655 0.1081761983 -0.22777782
## tsep 0.25186014 -0.16881584 -0.031188367 -0.0252319485 -0.11100418
## toct 0.26714245 -0.12423680 -0.014213457 0.0039586317 0.12864795
## tnov 0.27214789 -0.09552584 0.029032040 -0.0776651476 0.26276447
## tdec 0.27301687 -0.07380880 0.068758839 -0.1472904833 0.28523902
## pjan 0.16803736 0.25995212 0.198293078 -0.0491302786 -0.14878411
## pfeb 0.17102216 0.26393484 0.174815508 -0.0779200398 -0.13546314
## pmar 0.17796187 0.25650454 0.169544577 -0.0684787805 -0.08941670
## papr 0.17074339 0.27824680 0.066073520 0.0873621609 -0.01709438
## pmay 0.14748106 0.25905950 -0.096201569 0.4454588155 0.21475276
## pjun 0.05730680 0.23790590 -0.271290553 0.6551603362 0.14238968
## pjul 0.01369406 0.10097126 -0.573535723 -0.1391139500 0.07054361
## paug 0.03206302 0.14782502 -0.519764989 -0.4064516753 0.01206271
## psep 0.10354548 0.26528778 -0.300542026 -0.1481576621 -0.04780745
## poct 0.13939969 0.28759576 -0.081905529 -0.1479311802 -0.16441200
## pnov 0.16303542 0.27829605 0.141217106 -0.0566592085 -0.16857230
## pdec 0.16860662 0.26639834 0.170927455 -0.0622112513 -0.15532094
## PC6 PC7 PC8 PC9 PC10
## tjan -0.0284108206 0.006685677 0.150748941 -0.03370819 0.295282973
## tfeb 0.0003503783 0.136618117 0.096325727 0.05374427 0.146770345
## tmar 0.0472553831 0.299836795 0.107879864 0.12796171 -0.002868388
## tapr 0.1142053327 0.361840941 0.005051047 0.18027931 -0.245606706
## tmay 0.1162681105 0.391610249 0.152998679 0.14131428 -0.153895952
## tjun 0.0336846593 0.059413350 0.153361302 0.08729649 0.185251391
## tjul -0.0968873526 -0.282122219 0.078633470 -0.02281332 0.111072241
## taug -0.1100855056 -0.269202717 -0.078353138 -0.16060487 0.002348289
## tsep -0.0494334800 -0.197844681 -0.135371606 -0.17154105 0.019210005
## toct -0.0108001461 -0.228312237 -0.393469988 -0.13609013 -0.315583081
## tnov -0.0341735253 -0.123895015 -0.174524477 -0.10198795 -0.168381612
## tdec -0.0376637342 -0.070659399 0.019616400 -0.04699441 0.196876851
## pjan 0.2458067285 0.032089376 -0.048645619 -0.21338581 0.001100698
## pfeb 0.1820140769 -0.040185881 -0.011301000 0.07554311 -0.126403221
## pmar 0.2288319443 -0.177170757 0.160725487 0.13544683 -0.309717304
## papr -0.0925172437 -0.305189314 0.271689204 0.39781282 -0.204745692
## pmay -0.1948655497 -0.258064679 0.241986147 0.13372542 0.183453709
## pjun 0.1213474134 0.141181422 -0.086059500 -0.36227611 0.043038183
## pjul 0.4811346946 -0.176731790 -0.383344278 0.38567024 0.178533100
## paug 0.0604580646 -0.072069255 0.509760728 -0.38851386 -0.032443473
## psep -0.4843787725 0.196824537 -0.149665107 -0.09624515 -0.407497122
## poct -0.4759093700 0.159625862 -0.266370371 0.26037169 0.346421923
## pnov 0.0644439203 0.128788161 -0.117454901 -0.17604085 0.237115136
## pdec 0.1871590345 0.094644125 -0.105030745 -0.21895144 0.170211499
## PC11 PC12 PC13 PC14 PC15
## tjan -0.3177818979 0.084717452 -0.198073796 0.0306128664 -0.12524773
## tfeb -0.0850920746 0.207170982 0.215049963 -0.0248132032 0.07177408
## tmar 0.0837891432 0.151198192 0.338194830 -0.1213396869 -0.04343164
## tapr 0.2429592526 -0.018873516 0.244346289 0.0007484506 0.04381860
## tmay 0.0431667637 -0.232783794 -0.291244671 0.0480458427 -0.18616504
## tjun -0.0987876247 -0.033840012 -0.422444222 -0.0105642695 0.31526141
## tjul -0.0486904501 0.283065952 0.298939526 -0.0044454340 0.14450174
## taug -0.0152404625 0.136111286 0.291708371 -0.0496293283 -0.21486141
## tsep -0.0444107836 -0.120942065 -0.094460420 0.0209592856 -0.41993874
## toct 0.2093561076 -0.231921667 -0.011130377 0.0997939913 0.27607606
## tnov 0.1006998972 -0.233238733 -0.174605090 -0.0378639293 0.02149880
## tdec -0.1909224250 0.002929791 -0.196357064 0.0633928569 0.15143733
## pjan -0.0005762763 0.039714713 0.006401917 -0.3431345819 0.49814932
## pfeb -0.1595286785 0.149113176 0.045518203 0.2980140260 0.07726715
## pmar -0.0295711250 0.194223506 -0.064458463 0.5326818242 -0.13153712
## papr -0.3048488853 -0.403344025 0.133363364 -0.4467304855 -0.08102153
## pmay 0.5520813847 0.203821338 -0.246609418 0.0048849326 0.02882245
## pjun -0.4115111702 -0.123357939 0.112057802 0.1724675213 0.02767334
## pjul -0.0309151580 0.137303662 -0.039627958 -0.1174288399 -0.07339510
## paug 0.1609697226 -0.224188960 0.158706743 0.0989438683 0.06477934
## psep -0.1703085895 0.441341096 -0.234474281 -0.2164132305 -0.05426607
## poct 0.0248821147 -0.304950065 0.205141127 0.3425036906 0.16345751
## pnov 0.1821268769 -0.095799139 0.015392954 -0.1184016248 -0.41165840
## pdec 0.1898709898 0.053178843 -0.055907092 -0.2039300913 -0.10002376
## PC16 PC17 PC18 PC19 PC20
## tjan 0.148674822 -0.107041343 0.160823567 -0.0269825468 0.4272858093
## tfeb -0.017002651 0.234367308 -0.030332072 0.0627361960 0.0203446585
## tmar -0.035352024 -0.000212998 -0.023742455 -0.0452842790 -0.2687836322
## tapr 0.022030975 0.145866305 0.006613803 -0.0184863673 -0.0467386911
## tmay 0.142255714 -0.298105316 0.126282045 0.0006562435 0.3387549738
## tjun -0.160935797 0.249990410 -0.191135211 0.0289864553 -0.2723565962
## tjul -0.238251861 -0.162051895 -0.033972474 0.1319965479 0.2737881475
## taug 0.106065552 -0.250950490 0.035001101 -0.1434759326 -0.0549829727
## tsep 0.390707549 0.327566676 0.244781447 -0.0916673160 -0.3538791803
## toct 0.018144251 0.349455508 -0.075056635 0.0788151267 0.4142142645
## tnov -0.264971834 -0.588989556 -0.143864858 0.0332551632 -0.3245451022
## tdec -0.157870134 0.092966911 -0.097722453 0.0092514596 -0.1416159968
## pjan 0.206879187 -0.141990574 0.530872235 0.0890666735 -0.0947536147
## pfeb 0.530999363 -0.151466383 -0.533289719 0.2207678709 -0.0675137256
## pmar -0.380592388 0.074655676 0.335006369 -0.1362476881 -0.0616168429
## papr -0.056901612 0.106755938 -0.084786785 -0.0375141498 0.0485745440
## pmay 0.177758906 -0.022014450 0.025024070 0.0376327473 -0.0093294644
## pjun -0.094904165 0.013377094 -0.016298342 -0.0261197497 -0.0346808089
## pjul -0.005543386 -0.049203713 0.044870954 0.0112968029 0.0096717843
## paug 0.047348123 0.015708070 -0.052098593 0.0046396787 0.0009636633
## psep -0.040657537 0.041970363 -0.011347466 0.0246969815 0.0132694536
## poct 0.027120678 -0.090538295 0.148875670 -0.1375823255 -0.0299283689
## pnov -0.283906262 0.105283956 -0.052183791 0.6219718414 0.0532889852
## pdec -0.122022282 0.048156966 -0.325075890 -0.6730608587 0.1687923951
## PC21 PC22 PC23 PC24
## tjan 0.110569448 -0.174362955 0.466461484 0.1410538810
## tfeb 0.411083337 0.195963582 -0.553989382 0.3303973610
## tmar 0.190517645 -0.117562888 0.219088103 -0.6511559490
## tapr -0.404433770 -0.108135234 0.270516403 0.5155541679
## tmay -0.037106821 0.143632598 -0.348991274 -0.1986853412
## tjun 0.254278010 0.063546212 0.239743583 0.0692545700
## tjul -0.179763796 -0.459394939 -0.200353045 -0.0523957985
## taug 0.026308260 0.644090923 0.223105697 0.0795298706
## tsep -0.041652632 -0.338752088 -0.179241161 -0.0007117746
## toct 0.159245830 0.068434255 0.090617300 -0.2078886850
## tnov 0.183602516 -0.198760485 -0.051768256 0.1951652783
## tdec -0.671182075 0.274168418 -0.167825018 -0.2131251230
## pjan -0.025665936 0.023245016 -0.023533194 -0.0018978335
## pfeb -0.017169890 -0.043847725 0.001437087 -0.0140109546
## pmar 0.073977731 0.042827913 0.030519916 -0.0138651710
## papr -0.008348023 0.008701196 -0.001837501 0.0129567032
## pmay -0.007890496 0.009175983 -0.014398707 -0.0095682867
## pjun 0.019589810 -0.050004246 -0.001155038 -0.0001735104
## pjul -0.003853114 0.023997896 -0.014147625 -0.0044508995
## paug 0.004197034 0.012388955 0.011866533 0.0056221737
## psep -0.017989679 -0.007676584 -0.001546521 -0.0008137329
## poct 0.026887105 -0.009846198 -0.006560772 0.0004372792
## pnov -0.015092274 0.074152159 0.081791929 0.0084156928
## pdec -0.018484850 -0.087548286 -0.071983057 0.0171497426
Hopefully I’m interpreting this properly. PC1 is associated with
January temperatures (tjan) and December temperature (tdec) at
0.272 and 0.273, respectively. PC2 is associated with
October precipitation (poct) and November precipitation (pnov) at
0.288 and 0.278, respectively.
Use the quilt plot to produce site scores
We know that PC1 is associated with temperature based on the loadings
answer we had above. We can interpret the map as it getting cold at
higher latitudes and the temperature is less variable along the coast
than it is inland for a vast majority of the United States, except for
Alaska (minus the Inside Passage). PC2 is associated with precipitation
and this follows what I know about the PNW where it rains all the time
and places like the U.S. southwest is very dry.