##R setup with packages for this lab
library(ggplot2)
library(ggpubr)
library(plotly)
library(maps) #Displaying geographic data
library(cluster) #Cluster analysis package
library(fpc) #Flexible procedures for clustering
library(fields) #For working with spatial data
library(RColorBrewer) #Adding additional color palettes
setwd("~/Desktop/University of Utah PhD /Course Work/Fall 2022 Semester/GEOG6000_Data Analysis/lab07")
cars = read.csv("../datafiles/cars.tab.csv")
cars.use = cars[ , -c(1,2)]
cars.use = scale(cars.use) ##standardizes to zero mean and unit variance and converts each observation to a number of standard deviations away from the mean.
#What is an atomic vector?? The scaled data??
par(mfrow = c(2,1))
hist(cars[, "Horsepower"], main = "Horsepower: Original Scale")
hist(cars.use[, "Horsepower"], main = "Horsepower: Standardized Scale")
par(mfrow = c(2, 1))
##dist() function to explore the multivariate distances between different cars.
dist(cars.use[1:4, ], method = 'euclidean')
## 1 2 3
## 2 1.1398481
## 3 1.8791302 1.5379233
## 4 0.8681787 0.6873475 1.5065957
dist(cars.use[1:4, ], method = 'manhattan')
## 1 2 3
## 2 2.057498
## 3 3.816091 3.367820
## 4 1.681029 1.390295 2.785084
cars.dist = dist(cars.use)
cars.dist
## 1 2 3 4 5 6 7
## 2 1.1398481
## 3 1.8791302 1.5379233
## 4 0.8681787 0.6873475 1.5065957
## 5 6.5064614 6.4822961 5.1250923 6.3106342
## 6 5.1738042 5.0499221 3.8001685 4.9053397 1.8047804
## 7 5.5864282 5.6036243 4.3134390 5.4016818 1.2565441 1.0332439
## 8 6.1771844 6.0905876 4.7485982 5.9244671 0.7161097 1.2442626 1.0742030
## 9 4.8480549 5.0375972 3.8035553 4.8669738 2.3525992 2.1291365 1.6171913
## 10 3.5301708 3.7291650 2.6017142 3.5792512 3.5736168 2.6364429 2.6015184
## 11 4.9604936 5.1492658 3.9973838 4.9557693 2.3895043 1.8575477 1.3722800
## 12 3.3450719 3.6729516 2.6806678 3.5119282 3.9725777 3.0332883 2.9915492
## 13 3.0274764 2.7708193 1.5960990 2.7535396 3.7854036 2.3935780 3.0058421
## 14 3.9438666 3.7671366 2.4605709 3.7360814 2.8933537 1.9550300 2.3505016
## 15 2.7559271 2.5273572 1.4813731 2.5708289 4.1358672 2.7303071 3.3172245
## 16 2.4939094 2.2706162 1.3471376 2.2403426 4.3163472 2.8785776 3.4066196
## 17 1.4399982 0.8377919 0.7252240 1.0165631 5.7430227 4.3656972 4.9093636
## 18 1.7032564 0.9301185 0.7761175 1.1413136 5.7509851 4.3340633 4.9252674
## 19 1.2528877 0.2565576 1.3733210 0.6668156 6.3095525 4.8865115 5.4507553
## 20 1.2632414 0.7830240 0.8012940 0.7558885 5.7850435 4.4033377 4.9308012
## 21 5.2096828 5.0678428 3.8157918 4.9496450 1.6987044 0.3202520 1.0654982
## 22 3.7264974 3.6468502 2.2880581 3.5151842 2.9564835 1.7292372 2.0723052
## 23 6.9749041 6.9555364 5.5791028 6.7513604 0.7010289 2.2372431 1.7489593
## 24 6.4973291 6.3107820 4.9978582 6.1292098 1.7078072 1.6400328 1.8531241
## 25 5.4455210 5.2977556 4.0184339 5.1982029 1.5478613 0.6090464 1.2158603
## 26 6.7281549 6.7264044 5.3582423 6.5269933 0.3924042 2.0229042 1.3853449
## 27 6.2516001 6.0358394 4.7099391 5.9344674 1.2582953 1.2889114 1.5271840
## 28 5.1216822 4.8108545 3.6618306 4.7373244 2.5974309 1.0604331 2.0770315
## 29 4.1658959 3.9606554 2.6594600 3.7477335 3.1072917 1.6871993 2.3063279
## 30 3.9568990 3.8096329 2.4692785 3.6037691 3.0034600 1.5925941 2.1310964
## 31 5.4980541 5.2816705 4.0886676 5.1187130 2.3371058 1.1814421 1.9882024
## 32 6.4810802 6.3980437 5.0519523 6.2111056 0.9131638 1.6397765 1.5291032
## 33 6.8079280 6.7733290 5.3989540 6.5929239 0.3825189 2.0451145 1.5351385
## 34 6.7428379 6.5979100 5.2646847 6.4146012 1.6110434 1.9546073 2.0929158
## 35 6.3291275 6.3396799 4.9879301 6.1412506 0.3891058 1.6270124 0.9753294
## 36 4.5310683 4.6441960 3.2882995 4.4744300 2.3271473 1.9982563 1.7047030
## 37 5.0541206 5.1541528 3.9678998 4.9833487 2.1643824 1.5764399 1.1034142
## 38 6.7948022 6.7891309 5.4196563 6.5851179 0.4869201 2.0802109 1.4464682
## 8 9 10 11 12 13 14
## 2
## 3
## 4
## 5
## 6
## 7
## 8
## 9 2.4465576
## 10 3.4329055 1.5014408
## 11 2.3609068 0.8441935 1.7107844
## 12 3.8451138 1.8036762 0.5255723 1.9532352
## 13 3.3916875 2.7160211 1.9461582 2.8241709 2.2036527
## 14 2.6358656 2.0212696 1.8617590 2.3614848 2.2640202 1.1593258
## 15 3.7620304 2.8598682 1.9126425 2.9541083 2.0712991 0.5003866 1.4839209
## 16 3.9437224 2.9561575 1.8881155 3.0078993 2.0407394 0.7494156 1.7424394
## 17 5.3618900 4.3540948 3.0918327 4.5183655 3.1008994 2.0619163 3.0077541
## 18 5.3391575 4.4700081 3.2336889 4.6116307 3.2820741 2.0714550 3.0411856
## 19 5.9121360 4.9282952 3.6531977 5.0486036 3.6284414 2.6105509 3.6010959
## 20 5.4002110 4.3981637 3.1371642 4.5417264 3.1329324 2.1526641 3.1127122
## 21 1.1892967 2.0608821 2.6384138 1.8743475 3.0478077 2.3563277 1.8038583
## 22 2.6248563 1.7958162 1.3032347 1.9312784 1.7386606 1.1905000 1.0047733
## 23 1.0204637 2.9535998 4.1460710 2.9540731 4.5385930 4.2941996 3.4462460
## 24 1.1176406 3.3610043 4.0842400 3.1635894 4.5058524 3.7556124 3.2240898
## 25 1.0272409 2.2123917 2.8808062 2.0843180 3.2789801 2.5547655 1.9284228
## 26 0.8979276 2.5433909 3.7767096 2.5459045 4.1703806 4.0638586 3.1984995
## 27 0.7226258 2.8135472 3.6275852 2.7364046 4.0695291 3.3295052 2.6087595
## 28 1.9380432 3.0442380 3.2111523 2.8027787 3.5918097 2.2947691 2.1662867
## 29 2.5543986 2.8421867 2.5390312 2.7460931 2.9094252 1.8375723 1.9726533
## 30 2.5124425 2.4557730 2.0997795 2.4032013 2.4798109 1.5608200 1.6455885
## 31 1.6387728 3.2186993 3.5788929 2.9627948 3.9517206 2.8169783 2.5990762
## 32 0.5399900 2.8951122 3.8782418 2.8172944 4.2791128 3.7347060 3.0063712
## 33 0.8409380 2.6961640 3.9043616 2.7177677 4.3053914 4.0843817 3.2041305
## 34 1.1505897 3.5150291 4.3530385 3.3837812 4.7533608 4.0043668 3.3981805
## 35 0.6590117 2.1966443 3.3872917 2.1515380 3.7731194 3.6747621 2.8546416
## 36 2.3239170 0.8284025 1.4496585 1.4566540 1.8538570 2.2807905 1.4742083
## 37 2.0955544 0.9293048 1.7974353 0.4185086 2.1255921 2.7296457 2.2054957
## 38 0.9445111 2.6284194 3.8529893 2.6245026 4.2496167 4.1342707 3.2746391
## 15 16 17 18 19 20 21
## 2
## 3
## 4
## 5
## 6
## 7
## 8
## 9
## 10
## 11
## 12
## 13
## 14
## 15
## 16 0.6135479
## 17 1.8523603 1.6612209
## 18 1.9038060 1.7164735 0.3484249
## 19 2.4022245 2.1432108 0.6896808 0.7442321
## 20 1.9721830 1.7262461 0.3112976 0.4990653 0.6197894
## 21 2.6970359 2.8720900 4.3735675 4.3486218 4.9026316 4.4256685
## 22 1.4643705 1.5574226 2.9028656 2.9276306 3.4979870 2.9585980 1.7360496
## 23 4.6691439 4.8427560 6.2148382 6.2075757 6.7729493 6.2406187 2.1896065
## 24 4.1601229 4.3029176 5.6084492 5.5235895 6.1172285 5.6294160 1.7273423
## 25 2.8872833 3.1291034 4.5853313 4.5589804 5.1302201 4.6490493 0.4130040
## 26 4.4209623 4.5764749 5.9890670 5.9944815 6.5523856 6.0187706 1.9654050
## 27 3.7063800 3.9155152 5.3046264 5.2447759 5.8512434 5.3707284 1.1890624
## 28 2.6437355 2.8307240 4.1643391 4.0630542 4.6330340 4.2186003 1.1173955
## 29 2.2156802 2.2263245 3.2752130 3.1804593 3.7742653 3.2678478 1.8515108
## 30 1.9244810 1.9416802 3.1008883 3.0456623 3.6340760 3.1046275 1.7263843
## 31 3.2043755 3.3602005 4.6324672 4.5398596 5.0925051 4.6450790 1.3165811
## 32 4.1237846 4.3128218 5.6710966 5.6389351 6.2104759 5.6973056 1.6161605
## 33 4.4473685 4.6289818 6.0286118 6.0253378 6.5951862 6.0669668 1.9667330
## 34 4.4047432 4.6039058 5.8810526 5.8127673 6.4018476 5.9033347 1.9892055
## 35 4.0185214 4.1758140 5.6107231 5.6215377 6.1714477 5.6390014 1.5802116
## 36 2.5011695 2.6304055 3.9000247 3.9936022 4.5083666 3.9492086 1.9208685
## 37 2.9037674 2.9578242 4.5033203 4.5699679 5.0407289 4.5386834 1.5785176
## 38 4.4968791 4.6449792 6.0525231 6.0544862 6.6134046 6.0798614 2.0305467
## 22 23 24 25 26 27 28
## 2
## 3
## 4
## 5
## 6
## 7
## 8
## 9
## 10
## 11
## 12
## 13
## 14
## 15
## 16
## 17
## 18
## 19
## 20
## 21
## 22
## 23 3.4695843
## 24 3.0862555 1.5893300
## 25 1.9739830 2.0167725 1.6235892
## 26 3.1851298 0.4691402 1.6994356 1.8416265
## 27 2.6810030 1.5170768 1.0286972 0.9756777 1.4653244
## 28 2.0864283 2.9200943 1.8463675 1.1870636 2.8254805 1.6302850
## 29 1.3888145 3.3986058 2.4827899 2.0676480 3.2487747 2.5191228 1.6192914
## 30 0.9567097 3.3675915 2.6313055 1.9686886 3.1685641 2.5300071 1.7122281
## 31 2.4659351 2.4796851 1.2935247 1.3109424 2.4841532 1.5030967 0.8534042
## 32 3.0224100 0.8164795 0.9631897 1.4177507 0.9588893 0.9580309 2.1679101
## 33 3.2507493 0.3618590 1.6148982 1.7937362 0.2851090 1.3269554 2.7692642
## 34 3.3773124 1.3394565 0.6691116 1.7771885 1.5986781 1.1984472 2.1748994
## 35 2.7966626 0.8094290 1.6246464 1.4827076 0.4532477 1.3225642 2.4994953
## 36 1.3182650 2.8822778 3.1423812 2.0508011 2.5349410 2.6201402 2.7813383
## 37 1.8002414 2.7506750 2.8780007 1.8108202 2.3356329 2.4112481 2.5306662
## 38 3.2520390 0.4240946 1.6773707 1.9123832 0.1103997 1.4947954 2.8688498
## 29 30 31 32 33 34 35
## 2
## 3
## 4
## 5
## 6
## 7
## 8
## 9
## 10
## 11
## 12
## 13
## 14
## 15
## 16
## 17
## 18
## 19
## 20
## 21
## 22
## 23
## 24
## 25
## 26
## 27
## 28
## 29
## 30 0.4464760
## 31 1.7406712 1.9194350
## 32 2.7848997 2.8042141 1.6824079
## 33 3.2848474 3.2218676 2.4278263 0.8318586
## 34 2.8599053 2.9846217 1.5536115 0.7295724 1.4493989
## 35 2.9065222 2.8009512 2.2045161 0.9086064 0.5976211 1.5967788
## 36 2.3532768 1.9696566 2.9421027 2.7289716 2.6452250 3.2869352 2.2009025
## 37 2.5584726 2.2356866 2.7234010 2.5836214 2.4942355 3.1418917 1.9433425
## 38 3.2824562 3.2131639 2.5086130 0.9674219 0.3093247 1.5799187 0.5369531
## 36 37
## 2
## 3
## 4
## 5
## 6
## 7
## 8
## 9
## 10
## 11
## 12
## 13
## 14
## 15
## 16
## 17
## 18
## 19
## 20
## 21
## 22
## 23
## 24
## 25
## 26
## 27
## 28
## 29
## 30
## 31
## 32
## 33
## 34
## 35
## 36
## 37 1.4103762
## 38 2.6134362 2.4109537
##Use the distance matrix to perform hierarchical clustering for the cars dataset. Ward's method uses the distances between centroids of two clusters to assess similarity.
cars.hclust = hclust(cars.dist, method = 'ward.D2')
plot(cars.hclust,
labels = cars$Car,
main = 'Default from hclust')
plot(cars.hclust, labels = cars$Car, main = 'Default from hclust')
rect.hclust(cars.hclust, k = 3)
#Cutree function allows us to cut the tree to provide a set number of groups
groups.3 = cutree(cars.hclust, k = 3) #cutting the tree into 3 groups
table(groups.3) ##number of cars by group
## groups.3
## 1 2 3
## 8 17 13
cars$Car[groups.3 == 1] ##Which cars are in cluster 1
## [1] "Buick Estate Wagon" "Ford Country Squire Wagon"
## [3] "Chevy Malibu Wagon" "Chrysler LeBaron Wagon"
## [5] "Chevy Caprice Classic" "Ford LTD"
## [7] "Mercury Grand Marquis" "Dodge St Regis"
table(groups.3, cars$Country) ##checking to see if there is an association between the
##
## groups.3 France Germany Italy Japan Sweden U.S.
## 1 0 0 0 0 0 8
## 2 0 3 1 6 0 7
## 3 1 2 0 1 2 7
##Characterize the clusters based on the original information on the cars.
aggregate(cars[, -c(1,2)], list(groups.3), median)
## Group.1 MPG Weight Drive_Ratio Horsepower Displacement Cylinders
## 1 1 17.3 3.89 2.43 136.5 334 8
## 2 2 30.9 2.19 3.37 75.0 98 4
## 3 3 20.8 2.91 3.08 110.0 171 6
##use the cars data set (minus the first two rows), using the groups, calculate the median by group
##Utilizing the Western North America Climate Data set
##Ensure the maps package is loaded (initial start script for this lab)
##Read in the data.
wnaclim = read.csv("../datafiles/wnaclim2.csv")
wnaloc = cbind(wnaclim$Longitude, wnaclim$Latitude) #Combine the longitude and latitude into a new variable
ngrp = 6 #number of groups for k-means clustering
##Simple location map
plot(wnaloc,
xlab = "Longitude",
ylab = "Latitude") ##basic scatterplot
map(add = TRUE) ##adds the map to the current plot
##Extract the variables we need (temp and precip for all twelve months)
wnaclim = wnaclim[ , seq(3,26)] ##new dataframe with the temp and precip variables
wnaclim.s = scale(wnaclim) ##scale the data for clustering
wna.kmeans = kmeans(wnaclim.s, ngrp, nstart = 50, iter.max = 20) #kmeans set up. nstart and iter.max?
table(wna.kmeans$cluster)
##
## 1 2 3 4 5 6
## 59 251 556 563 265 723
##Plotting the distribution of k-means clusters
mycol = rainbow(ngrp)
plot(wnaloc,
xlab = "Longitude",
ylab = "Latitude",
main = "Western North America Climate Clusters",
pch = 16,
col = mycol[wna.kmeans$cluster])
map(add =TRUE)
#Prototypes are the centers of the clusters
wna.kmeans$centers #scaled values
## tjan tfeb tmar tapr tmay tjun
## 1 0.8696892 0.7934518 0.6850066 0.44533319 0.07647746 -0.3963639
## 2 1.2825180 1.2543921 1.2806068 1.37581299 1.59705382 1.8318690
## 3 -0.3761156 -0.2909757 -0.1935225 -0.07483776 -0.12525130 -0.2838725
## 4 -1.4292438 -1.5014047 -1.5383907 -1.51416330 -1.29643277 -1.0042640
## 5 0.8399177 0.7783219 0.6436532 0.39417236 0.05729332 -0.2802638
## 6 0.5781227 0.6074024 0.6103689 0.57818028 0.52417146 0.4994329
## tjul taug tsep toct tnov tdec pjan
## 1 -0.5721759 -0.29236349 0.1102208 0.4310103 0.6821368 0.8699948 2.8770982
## 2 1.7727118 1.68622243 1.7257717 1.4714932 1.3441639 1.3053881 -0.3630834
## 3 -0.3507435 -0.33572744 -0.3507177 -0.2101420 -0.2604109 -0.3602986 -0.2677048
## 4 -1.0448910 -1.20204332 -1.3024300 -1.4841307 -1.4998126 -1.4459498 -0.5811195
## 5 -0.2849497 -0.05094281 0.2264464 0.4708470 0.6530901 0.8092819 2.1507312
## 6 0.6190960 0.65134482 0.5927899 0.5986939 0.6064769 0.5822323 -0.2386507
## pfeb pmar papr pmay pjun pjul paug
## 1 3.2293089 3.0878762 3.8301542 3.62177688 2.7448484 2.1779382 3.2109996
## 2 -0.3446015 -0.3063620 -0.5330200 -0.78085302 -0.9738620 0.4792803 0.5274905
## 3 -0.2546389 -0.2304234 -0.1051083 0.29642889 0.7587406 0.8371566 0.5602187
## 4 -0.5970656 -0.6599811 -0.6771999 -0.80484085 -0.4849090 -0.2404945 -0.1520241
## 5 2.0448033 2.0401464 1.6629589 1.16116541 0.5528940 -0.6659339 -0.4584676
## 6 -0.2326141 -0.2022710 -0.1288679 -0.05129823 -0.2944393 -0.5565503 -0.5895535
## psep poct pnov pdec
## 1 4.9542629 5.0214147 3.5076945 3.1658742
## 2 -0.1840932 -0.3027824 -0.4102346 -0.3587566
## 3 0.2953857 -0.1190760 -0.2465633 -0.2561016
## 4 -0.4098152 -0.4293762 -0.5400784 -0.5803758
## 5 0.4577024 1.1450541 2.0002633 2.0964498
## 6 -0.4161739 -0.2984224 -0.2668071 -0.2533248
#Revert back to the original values using the aggregate function
wna.centers <- aggregate(wnaclim, list(wna.kmeans$cluster), mean)
wna.centers
## Group.1 tjan tfeb tmar tapr tmay tjun
## 1 1 -0.1474576 1.201695 2.572881 4.752542 7.9559322 10.884746
## 2 2 4.6573705 6.748606 9.155378 12.996813 17.5936255 22.765339
## 3 3 -14.6471223 -11.848201 -7.136511 0.143705 6.6773381 11.484532
## 4 4 -26.9042629 -26.414387 -21.999822 -12.609059 -0.7458259 7.643517
## 5 5 -0.4939623 1.019623 2.115849 4.299245 7.8343396 11.503774
## 6 6 -3.5409405 -1.037206 1.747994 5.929599 10.7934993 15.660996
## tjul taug tsep toct tnov tdec pjan
## 1 13.11017 13.352542 10.971186 7.033898 2.528814 0.3355932 297.11864
## 2 25.49482 24.309163 21.023108 15.462550 9.292032 5.0641434 30.27092
## 3 14.27968 13.112410 8.103237 1.840108 -7.100180 -13.0258993 38.12590
## 4 10.61350 8.315098 2.181705 -8.480107 -19.761812 -24.8165187 12.31439
## 5 14.62717 14.689434 11.694340 7.356604 2.232075 -0.3237736 237.29811
## 6 19.40194 18.578423 13.973721 8.392254 1.755878 -2.7896266 40.51867
## pfeb pmar papr pmay pjun pjul paug
## 1 252.677966 221.64407 184.084746 140.50847 112.83051 102.72881 136.13559
## 2 25.912351 30.29084 14.996016 11.39442 11.39044 56.69323 61.03187
## 3 31.620504 34.57194 31.579137 42.98741 58.65288 66.39209 61.94784
## 4 9.893428 10.35524 9.408526 10.69094 24.72824 37.18650 42.01421
## 5 177.520755 162.57736 100.098113 68.34717 53.03774 25.65660 33.43774
## 6 33.017981 36.15906 30.658368 32.78976 29.92393 28.62102 29.76902
## psep poct pnov pdec
## 1 223.18644 371.35593 334.55932 334.00000
## 2 32.84462 25.55777 24.11952 31.40637
## 3 50.60612 37.48921 37.08813 40.21942
## 4 24.48313 17.33570 13.83126 12.38011
## 5 56.61887 119.59245 215.11698 242.18868
## 6 24.24758 25.84094 35.48409 40.45781
temp.centers <- wna.centers[ , 2:13]
ppt.centers <- wna.centers[ , 14:25]
##Climatology plot for the clusters using matplot()
matplot(t(temp.centers), type = 'l', lwd = 2, lty = 2, col = mycol,
xlab = "Month", ylab = "Temp C")
matplot(t(ppt.centers), type = 'l', lwd = 2, lty = 2, col = mycol,
xlab = "Month", ylab = "PPT mm")
#why transpose? If you dont, the figure will use the cluster row # as the x - axis**
#cluster package calculates the silhouette index
#fpc pacakge calculates the Calinski-Harabasz index
#pacakges loaded at the beginning of this script
calinhara(wnaclim.s, wna.kmeans$cluster)
## [1] 1731.109
sil.out = silhouette(wna.kmeans$cluster, dist(wnaclim.s))
sil.out[1:4 , ] #rows 1 -4
## cluster neighbor sil_width
## [1,] 6 3 -0.002213331
## [2,] 4 3 0.652592256
## [3,] 3 6 0.473669262
## [4,] 4 3 0.646008575
#this index is between 0-1, but my values are higher?***
mean(sil.out[ , 3])
## [1] 0.3629048
tapply(sil.out[ , 3], sil.out[ , 1], mean)
## 1 2 3 4 5 6
## 0.3362736 0.2945774 0.2747485 0.5613447 0.4060656 0.2862479
##Saving to pdf - couldn't scale up in the original document
pdf()
plot(sil.out, col = mycol, main = "WNA Climate Silhoutte Plot")
invisible(dev.off()) #suppresses the device off message in the html output
Silhoutte Plot
#Trying the grid pacakage to embed the image
library(grid)
source("cluster_num_script.r")
##Calinski-Harabasz Index Plot
plot(1:20,ch.out, type = 'b', lwd = 2,
xlab = "N Groups", ylab = "C", main = "Calinski-Harabasz index")
#Silhoutte Plot
plot(1:20, sil.out, type = 'b', lwd = 2,
xlab = "N Groups",
ylab = "C",
main = " Average Silhoutte Index")
state = read.csv("../datafiles/statedata.csv")
state2 = state[ , -1] #remove the state names
rownames(state2) = state[ , 1] #sets the rownames by state instead of number. Essentially removes it as a variable
##the scale = true argument scales the data to account for a variety of units
state.pca = prcomp(state2, scale = TRUE)
summary(state.pca)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7
## Standard deviation 1.8971 1.2775 1.0545 0.84113 0.62019 0.55449 0.38006
## Proportion of Variance 0.4499 0.2040 0.1390 0.08844 0.04808 0.03843 0.01806
## Cumulative Proportion 0.4499 0.6539 0.7928 0.88128 0.92936 0.96780 0.98585
## PC8
## Standard deviation 0.33643
## Proportion of Variance 0.01415
## Cumulative Proportion 1.00000
##Scaling the standard deviatioback up to the variance (which is the square of the SD)
state.pca$sdev^2
## [1] 3.5988956 1.6319192 1.1119412 0.7075042 0.3846417 0.3074617 0.1444488
## [8] 0.1131877
##Visualization tool using a screeplot
screeplot(state.pca)
##Rotation is the loadings between the individual variables and the new components
## High values indicate greater association. Direction can be positive or negative
state.pca$rotation
## PC1 PC2 PC3 PC4 PC5
## Population 0.12642809 0.41087417 -0.65632546 -0.40938555 0.405946365
## Income -0.29882991 0.51897884 -0.10035919 -0.08844658 -0.637586953
## Illiteracy 0.46766917 0.05296872 0.07089849 0.35282802 0.003525994
## Life.Exp -0.41161037 -0.08165611 -0.35993297 0.44256334 0.326599685
## Murder 0.44425672 0.30694934 0.10846751 -0.16560017 -0.128068739
## HS.Grad -0.42468442 0.29876662 0.04970850 0.23157412 -0.099264551
## Frost -0.35741244 -0.15358409 0.38711447 -0.61865119 0.217363791
## Area -0.03338461 0.58762446 0.51038499 0.20112550 0.498506338
## PC6 PC7 PC8
## Population -0.01065617 -0.062158658 -0.21924645
## Income 0.46177023 0.009104712 0.06029200
## Illiteracy 0.38741578 -0.619800310 -0.33868838
## Life.Exp 0.21908161 -0.256213054 0.52743331
## Murder -0.32519611 -0.295043151 0.67825134
## HS.Grad -0.64464647 -0.393019181 -0.30724183
## Frost 0.21268413 -0.472013140 0.02834442
## Area 0.14836054 0.286260213 0.01320320
##The loadings reflect both the strength and the direction of association (the direction of the eigenvectors), so life expectancy increases toward negative values of component 1, and area increases with positive values of component 2. We can use this to try and assign some meaning to the axes: e.g. axis 1 represents a social-economic gradient, and axis 2 is related the physical geography of the state.
##Site scores
state.pca$x [ , 1:4]
## PC1 PC2 PC3 PC4
## AL 3.78988728 -0.23477897 0.229317426 0.383268981
## AK -1.05313550 5.45617512 4.240590401 0.575673699
## AZ 0.86742876 0.74506148 0.077268654 1.718843204
## AR 2.38177761 -1.28834366 0.222792819 0.623207350
## CA 0.24138147 3.50952277 -2.806440773 -0.070375517
## CO -2.06218136 0.50566387 0.511384160 -0.109922581
## CT -1.89943583 -0.24300645 -0.662670280 0.169730671
## DE -0.42478394 -0.50791950 0.218723788 -0.192652755
## FL 1.17212341 1.13474136 -1.281840070 0.488495802
## GA 3.29417162 0.10995684 0.387068686 -0.455587595
## HI -0.48704129 0.12526216 -1.377335153 2.950230930
## ID -1.42342916 -0.61114319 0.434488061 0.405610358
## IL 0.11896424 1.28238783 -0.803855520 -1.582992442
## IN -0.47120189 -0.24520088 -0.301483896 -0.656489461
## IA -2.32181208 -0.53685609 -0.293246733 0.204744921
## KS -1.90151483 -0.07719072 -0.177111056 0.614073058
## KY 2.12935981 -1.06425233 0.251716929 -0.348839068
## LA 4.24100842 -0.34630079 0.228892174 0.879342117
## ME -0.96019374 -1.70241922 0.721478601 -0.545261667
## MD -0.20342599 0.38881112 -0.339225021 -0.661994566
## MA -1.19589376 -0.21865625 -1.019629690 0.290071067
## MI 0.18186944 0.84711636 -0.554194008 -1.182753855
## MN -2.43361605 -0.36533543 -0.272388339 0.067401738
## MS 4.03208863 -1.05124066 0.919582795 0.175909614
## MO 0.31125449 -0.14830589 0.006605131 -0.553888853
## MT -1.37887297 -0.03353877 1.339951830 -0.244949880
## NE -2.18101665 -0.54774825 0.055771873 0.455863194
## NV -1.12708455 1.13291366 1.890646998 -1.504838621
## NH -1.67128925 -1.31239813 0.437881704 -0.480357556
## NJ -0.64958222 0.28146986 -0.973902564 -0.616106237
## NM 1.32244692 -0.29357041 1.361964694 0.707016778
## NY 1.05034998 1.89371072 -2.197830728 -1.266966983
## NC 2.69433377 -0.51713890 0.142734146 -0.560640547
## ND -2.41766786 -0.78192203 0.276975457 -0.136309449
## OH -0.26795708 0.41685336 -1.032942807 -1.147560168
## OK 0.07391320 -0.64658337 -0.071187462 0.606914285
## OR -1.32472856 0.22767511 -0.498963071 1.350011371
## PA 0.07738173 0.26940938 -1.070097480 -1.289496583
## RI -0.74084731 -1.46130325 -0.227824040 0.296882798
## SC 3.71100631 -0.90984427 0.748732017 -0.315820402
## SD -2.01253414 -1.31509491 0.536546594 -0.157426050
## TN 2.21813394 -0.65102504 -0.016767256 0.002741771
## TX 2.41364282 2.32744119 -0.286040511 0.804077700
## UT -2.26283736 -0.53433138 0.219767287 0.850616612
## VT -1.36926611 -1.50938322 0.445702144 -0.359113201
## VA 0.99354796 0.18457034 -0.210828404 -0.324420224
## WA -1.34001299 0.51154448 -0.851811855 1.237439671
## WV 1.50662213 -1.60198375 0.492191330 -0.342121310
## WI -1.75754046 -0.63572738 -0.425370544 -0.112299153
## WY -1.48379101 0.04225606 1.354211561 -0.638982968
#Sorting for gradients
sort(state.pca$x[, 1])
## MN ND IA UT NE CO
## -2.43361605 -2.41766786 -2.32181208 -2.26283736 -2.18101665 -2.06218136
## SD KS CT WI NH WY
## -2.01253414 -1.90151483 -1.89943583 -1.75754046 -1.67128925 -1.48379101
## ID MT VT WA OR MA
## -1.42342916 -1.37887297 -1.36926611 -1.34001299 -1.32472856 -1.19589376
## NV AK ME RI NJ HI
## -1.12708455 -1.05313550 -0.96019374 -0.74084731 -0.64958222 -0.48704129
## IN DE OH MD OK PA
## -0.47120189 -0.42478394 -0.26795708 -0.20342599 0.07391320 0.07738173
## IL MI CA MO AZ VA
## 0.11896424 0.18186944 0.24138147 0.31125449 0.86742876 0.99354796
## NY FL NM WV KY TN
## 1.05034998 1.17212341 1.32244692 1.50662213 2.12935981 2.21813394
## AR TX NC GA SC AL
## 2.38177761 2.41364282 2.69433377 3.29417162 3.71100631 3.78988728
## MS LA
## 4.03208863 4.24100842
#sorting by PC1?
##Biplot for first two components
biplot(state.pca, xlim = c(-0.4, 0.6))
biplot(state.pca, choices = c(1,3))
##Boston Cluster Analysis
boston = read.csv("../datafiles/boston6k.csv")
boston.loc = cbind(boston$LON, boston$LAT)
boston1 = boston[ , 9:21]
##Scale the data
boston1.s = scale(boston1)
##Run the loop
source("cluster_boston_script.r")
##Results
plot(1:20, sil.out2, type = "b",
lwd = 2,
xlab = "n Groups",
ylab = "Index Score",
main = "Average Silhoutte Score")
##Plotting 2 versus 6 clusters
ngrp2 = 2
ngrp6 = 6
mycol2 = brewer.pal(ngrp6, name = "Accent") #Experimenting with different color palette
boston.kmeans2 = kmeans(boston1.s, ngrp2, nstart = 50, iter.max = 20) ##Two groups
boston.kmeans6 = kmeans(boston1.s, ngrp6, nstart = 50, iter.max = 20) ##Six groups
par(mfrow = c(1,2))
plot(boston.loc,
xlab = "Longitude",
ylab = "Latitude",
main = "Boston Housing Clusters (2)",
pch = 16,
col = alpha(mycol2[boston.kmeans2$cluster], 0.5))
map(add =TRUE)
plot(boston.loc,
xlab = "Longitude",
ylab = "Latitude",
main = "Boston Housing Clusters (6)",
pch = 16,
col = alpha(mycol2[boston.kmeans6$cluster], 0.5))
map(add =TRUE)
##Output from the previous set of code used to help visualize the difference between two and 6 clusters
boston.kmeans6
## K-means clustering with 6 clusters of sizes 34, 50, 35, 83, 89, 215
##
## Cluster means:
## CRIM ZN INDUS CHAS NOX RM AGE
## 1 -0.1985497 -0.2602436 0.2799956 3.6647712 0.3830784 0.27566811 0.3721322
## 2 -0.2807410 -0.4872402 1.5745577 -0.2723291 1.0086925 -0.48725596 0.9430257
## 3 1.4802645 -0.4872402 1.0149946 -0.2723291 0.9676887 -0.29693894 0.7656016
## 4 -0.4124374 1.9050974 -1.1029195 -0.2248941 -1.1420321 0.63738196 -1.3699186
## 5 0.9722616 -0.4872402 1.0149946 -0.2723291 1.0010692 -0.47909897 0.7460871
## 6 -0.3875372 -0.2999768 -0.5700686 -0.2723291 -0.4262088 0.07032582 -0.1827830
## DIS RAD TAX PTRATIO B LSTAT
## 1 -0.4033382 0.001081444 -0.09756330 -0.39245849 0.17154271 -0.1643525
## 2 -0.8946961 -0.616658684 0.03359999 -0.30002307 -0.07381173 0.5971853
## 3 -0.8580043 1.659602895 1.52941294 0.80577843 -3.29705640 1.1699052
## 4 1.5217901 -0.615191967 -0.60132504 -0.72908309 0.35337310 -0.9051745
## 5 -0.8009959 1.659602895 1.52941294 0.80577843 0.16590165 0.8119640
## 6 0.1556210 -0.576435631 -0.64232592 -0.05143203 0.32167393 -0.2900152
##
## Clustering vector:
## [1] 1 1 1 5 5 5 5 1 1 5 5 3 5 1 1 5 1 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5
## [38] 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 3 3 3 3 3 3 3 3 3 3 3 5 5 5 3 3 3 3 3 3 3
## [75] 3 3 3 3 3 3 3 3 3 5 5 5 5 5 5 3 5 5 5 5 3 5 5 5 3 3 3 3 5 5 5 5 5 5 5 5 3
## [112] 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 2 2 2 2 2 6 6 6 6 6 6 6 6 6 6 6
## [149] 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6
## [186] 6 6 6 6 4 4 6 6 6 6 6 6 6 6 6 6 6 4 4 4 4 4 4 4 6 6 6 6 4 4 4 4 6 6 6 6 6
## [223] 6 6 6 6 6 6 6 6 4 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6
## [260] 6 6 6 6 6 6 6 6 6 6 6 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 2 2 2
## [297] 2 2 2 2 2 2 1 2 1 1 2 2 2 2 1 2 1 1 2 2 2 2 2 2 2 2 6 6 6 6 6 6 6 6 6 6 6
## [334] 6 6 6 6 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 6 6 6 1 1 1 1 1 6 6 6 1 6 1 1
## [371] 1 1 1 6 6 6 6 6 6 6 6 6 6 6 1 6 1 6 4 4 4 6 4 4 6 6 4 6 4 4 4 4 4 4 4 4 4
## [408] 6 6 6 6 6 6 6 6 6 6 6 6 1 6 6 6 1 1 4 1 1 4 4 4 4 1 4 4 4 4 4 4 4 4 4 4 6
## [445] 6 6 6 6 4 4 4 4 4 4 4 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6
## [482] 4 4 6 6 6 6 6 6 6 6 4 6 4 4 6 6 4 4 4 4 4 4 4 4 4
##
## Within cluster sum of squares by cluster:
## [1] 290.2630 290.5607 183.3608 338.7875 419.3128 909.5003
## (between_SS / total_SS = 63.0 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
##Convert the back to the unscaled data by cluster, then calculate the mean value by cluster. Included the original first few columns as well (primarily for median price columns)
boston.kmeans6.centers = aggregate(boston, list(boston.kmeans6$cluster), mean)
library(kableExtra)
boston.kmeans6.centers %>%
kbl() %>%
kable_classic() %>%
scroll_box(width = "500px", height = "200px")
Group.1 | ID | TOWN | TOWNNO | TRACT | LON | LAT | MEDV | CMEDV | CRIM | ZN | INDUS | CHAS | NOX | RM | AGE | DIS | RAD | TAX | PTRATIO | B | LSTAT |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
1 | 281.47059 | NA | 46.52941 | 2875.9412 | -71.17256 | 42.34603 | 27.80588 | 27.80588 | 1.9056897 | 5.294118 | 13.057647 | 1.0000000 | 0.5990853 | 6.478324 | 79.05000 | 2.945729 | 9.558824 | 391.7941 | 17.60588 | 372.33500 | 11.479412 |
2 | 278.96000 | NA | 33.52000 | 3318.6000 | -71.09546 | 42.38783 | 19.55200 | 19.55200 | 1.1987174 | 0.000000 | 21.938800 | 0.0000000 | 0.6715800 | 5.942280 | 95.12000 | 1.911072 | 4.180000 | 413.9000 | 17.80600 | 349.93540 | 16.917600 |
3 | 73.34286 | NA | 82.11429 | 821.4571 | -71.08327 | 42.31907 | 12.90000 | 12.87143 | 16.3460857 | 0.000000 | 18.100000 | 0.0000000 | 0.6668286 | 6.076000 | 90.12571 | 1.988334 | 24.000000 | 666.0000 | 20.20000 | 55.66971 | 21.007429 |
4 | 383.07229 | NA | 42.21687 | 3789.3012 | -71.16298 | 42.31999 | 29.02530 | 29.03735 | 0.0659249 | 55.795181 | 3.570361 | 0.0120482 | 0.4223590 | 6.732470 | 30.01325 | 6.999492 | 4.192771 | 306.8916 | 16.87711 | 388.93518 | 6.189157 |
5 | 69.01124 | NA | 81.56180 | 761.6404 | -71.07796 | 42.32645 | 16.43708 | 16.39663 | 11.9764754 | 0.000000 | 18.100000 | 0.0000000 | 0.6706966 | 5.948011 | 89.57640 | 2.108378 | 24.000000 | 666.0000 | 20.20000 | 371.82000 | 18.451348 |
6 | 298.83256 | NA | 33.28372 | 3216.8326 | -71.06380 | 42.39237 | 23.97721 | 23.98465 | 0.2801047 | 4.367442 | 7.225907 | 0.0000000 | 0.5053070 | 6.334047 | 63.42977 | 4.122735 | 4.530233 | 299.9814 | 18.34419 | 386.04121 | 10.582047 |
value = cbind(boston.kmeans6.centers$Group.1, boston.kmeans6.centers$CMEDV)
value2 = as.data.frame(value)
colnames(value2) = c("Cluster", "Mean Corrected House Value")
library(kableExtra)
value2 %>%
kbl(caption = "House Value by Cluster") %>%
kable_classic( full_width = F)
Cluster | Mean Corrected House Value |
---|---|
1 | 27.80588 |
2 | 19.55200 |
3 | 12.87143 |
4 | 29.03735 |
5 | 16.39663 |
6 | 23.98465 |
##Setting up a new data frame for the data the includes the cluster number
boston.clus.df = data.frame(cluster = boston.kmeans6$cluster, boston)
##Change the cluster ID to a character
boston.clus.df$cluster = as.character(boston.clus.df$cluster)
clust.aov = aov(CMEDV ~ cluster, data = boston.clus.df)
clust.aov.sum = summary(clust.aov)
print.sum = print(clust.aov.sum)
## Df Sum Sq Mean Sq F value Pr(>F)
## cluster 5 11973 2394.5 39.12 <2e-16 ***
## Residuals 500 30605 61.2
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
print.sum
## Df Sum Sq Mean Sq F value Pr(>F)
## cluster 5 11973 2394.5 39.12 <2e-16 ***
## Residuals 500 30605 61.2
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##Read in a new set of data to work with
wna.pca.data = read.csv("../datafiles/wnaclim2.csv")
##Use only the monthly temp and precip data
wnaclim.temp.precip = wna.pca.data[ , 3:26]
##PCA - scale = TRUE syntax scales the data
wnaclim.pca = prcomp(wnaclim.temp.precip, scale = TRUE)
##Biplot
par(mfrow = c(1,2))
wna.biplot = biplot(wnaclim.pca)
##Screeplot
wna.scree = screeplot(wnaclim.pca)
summary(wnaclim.pca)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7
## Standard deviation 3.4612 2.8065 1.5846 0.84733 0.63986 0.51316 0.29261
## Proportion of Variance 0.4992 0.3282 0.1046 0.02992 0.01706 0.01097 0.00357
## Cumulative Proportion 0.4992 0.8274 0.9320 0.96190 0.97896 0.98993 0.99350
## PC8 PC9 PC10 PC11 PC12 PC13 PC14
## Standard deviation 0.20707 0.17437 0.15014 0.13534 0.10798 0.09353 0.08322
## Proportion of Variance 0.00179 0.00127 0.00094 0.00076 0.00049 0.00036 0.00029
## Cumulative Proportion 0.99528 0.99655 0.99749 0.99825 0.99874 0.99910 0.99939
## PC15 PC16 PC17 PC18 PC19 PC20 PC21
## Standard deviation 0.06116 0.05676 0.04584 0.04161 0.03439 0.02946 0.02562
## Proportion of Variance 0.00016 0.00013 0.00009 0.00007 0.00005 0.00004 0.00003
## Cumulative Proportion 0.99955 0.99968 0.99977 0.99984 0.99989 0.99993 0.99995
## PC22 PC23 PC24
## Standard deviation 0.02411 0.01802 0.01392
## Proportion of Variance 0.00002 0.00001 0.00001
## Cumulative Proportion 0.99998 0.99999 1.00000
## Look at the loadings for PC1 and PC2 (columns 1 and 2)
wnaclim.pca$rotation[ , 1:2]
## PC1 PC2
## tjan 0.27154635 -0.07007925
## tfeb 0.27524689 -0.07830351
## tmar 0.27484658 -0.09367701
## tapr 0.26473010 -0.12257641
## tmay 0.23877755 -0.16889072
## tjun 0.20220435 -0.21701738
## tjul 0.20390908 -0.22851811
## taug 0.22863878 -0.20271341
## tsep 0.25186014 -0.16881584
## toct 0.26714245 -0.12423680
## tnov 0.27214789 -0.09552584
## tdec 0.27301687 -0.07380880
## pjan 0.16803736 0.25995212
## pfeb 0.17102216 0.26393484
## pmar 0.17796187 0.25650454
## papr 0.17074339 0.27824680
## pmay 0.14748106 0.25905950
## pjun 0.05730680 0.23790590
## pjul 0.01369406 0.10097126
## paug 0.03206302 0.14782502
## psep 0.10354548 0.26528778
## poct 0.13939969 0.28759576
## pnov 0.16303542 0.27829605
## pdec 0.16860662 0.26639834
The February and March temperature have high loadings (~0.28 & 0.27 respectively) on principle component 1.
The October and November precipitation have high loadings (~0.29 & 0.28 respectively) on principle component 2.
Based on these loadings, we see that principle component 1 is largely a temperature gradient whereas principle component 2 is largely a precipitation gradient. This matches what we see in our ordination plot as well.
##PC1
##Site scores plot
##Set the score as a variable
wnaclim.pca.score = wnaclim.pca$x[ , 1]
##Map the score to location (long/lat)
quilt.plot(wna.pca.data$Longitude, wna.pca.data$Latitude, wnaclim.pca.score)
world(add = TRUE)
##PC2
##Set the score as a variable
wnaclim.pca.score2 = wnaclim.pca$x[ , 2]
##Map the score to location (long/lat)
quilt.plot(wna.pca.data$Longitude, wna.pca.data$Latitude, wnaclim.pca.score2)
world(add = TRUE)