Problem 1:

The data ‘crime’ gives rates of different types of crime per 100,000 residents of the 50 states of the USA for the year 1986.

#Loading libraries.
library(readr)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ purrr     1.0.2
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(cluster)
library(dendextend)
## 
## ---------------------
## Welcome to dendextend version 1.19.0
## Type citation('dendextend') for how to cite the package.
## 
## Type browseVignettes(package = 'dendextend') for the package vignette.
## The github page is: https://github.com/talgalili/dendextend/
## 
## Suggestions and bug-reports can be submitted at: https://github.com/talgalili/dendextend/issues
## You may ask questions at stackoverflow, use the r and dendextend tags: 
##   https://stackoverflow.com/questions/tagged/dendextend
## 
##  To suppress this message use:  suppressPackageStartupMessages(library(dendextend))
## ---------------------
## 
## 
## Attaching package: 'dendextend'
## 
## The following object is masked from 'package:stats':
## 
##     cutree
library(dendsort)
library(factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(dplyr)
library(vegan)
## Loading required package: permute
## 
## Attaching package: 'permute'
## 
## The following object is masked from 'package:dendextend':
## 
##     shuffle
## 
## Loading required package: lattice
## Registered S3 method overwritten by 'vegan':
##   method     from      
##   rev.hclust dendextend
#Loading the data.
crime <- read.csv('/Users/michaelcajigal/Desktop/MA564/Homework 4/crime.csv')
head(crime)
##   Murder Rape Robbery Assault Burglary Theft Vehicle State
## 1    2.0 14.8      28     102      803  2347     164    ME
## 2    2.2 21.5      24      92      755  2208     228    NH
## 3    2.0 21.8      22     103      949  2697     181    VT
## 4    3.6 29.7     193     331     1071  2189     906    MA
## 5    3.5 21.4     119     192     1294  2568     705    RI
## 6    4.6 23.8     192     205     1198  2758     447    CT
summary(crime)
##      Murder            Rape          Robbery         Assault     
##  Min.   : 1.000   Min.   :11.60   Min.   :  7.0   Min.   : 32.0  
##  1st Qu.: 3.700   1st Qu.:23.27   1st Qu.: 67.0   1st Qu.:176.5  
##  Median : 6.300   Median :30.10   Median :109.5   Median :248.0  
##  Mean   : 6.776   Mean   :33.85   Mean   :142.1   Mean   :275.7  
##  3rd Qu.: 9.275   3rd Qu.:43.45   3rd Qu.:202.8   3rd Qu.:366.0  
##  Max.   :13.500   Max.   :72.70   Max.   :514.0   Max.   :605.0  
##     Burglary        Theft         Vehicle         State          
##  Min.   : 385   Min.   :1358   Min.   : 99.0   Length:50         
##  1st Qu.: 894   1st Qu.:2366   1st Qu.:209.8   Class :character  
##  Median :1148   Median :2812   Median :328.0   Mode  :character  
##  Mean   :1197   Mean   :2918   Mean   :382.2                     
##  3rd Qu.:1425   3rd Qu.:3382   3rd Qu.:529.5                     
##  Max.   :2221   Max.   :4373   Max.   :906.0
#Checking for any missing data because PCA interpretation changes.
colSums(is.na(crime))
##   Murder     Rape  Robbery  Assault Burglary    Theft  Vehicle    State 
##        0        0        0        0        0        0        0        0
#Remove first 3 columns 
crime_data <- crime %>% select(-State)
head(crime_data)
##   Murder Rape Robbery Assault Burglary Theft Vehicle
## 1    2.0 14.8      28     102      803  2347     164
## 2    2.2 21.5      24      92      755  2208     228
## 3    2.0 21.8      22     103      949  2697     181
## 4    3.6 29.7     193     331     1071  2189     906
## 5    3.5 21.4     119     192     1294  2568     705
## 6    4.6 23.8     192     205     1198  2758     447

*Note: Since there are no entries missing, all data is preserved and will be used for further analysis.

  1. Run hierarchical cluster using complete and ward methods. Create dendrograms. Did you get similar clusters?
###############   Hierarchical Clustering   #############
#################    Complete Linkage   #################

#1.Compute distance matrix.
#The dist() function uses Euclidean distance by default.
dist_matrix <- dist(crime_data)

#2.Perform hierarchical clustering using complete linkage.
hc_complete <- hclust(dist_matrix, method = "complete")

#3.Plot the dendrogram, dendsort function orders dendodram.
dend_complete <- dendsort(as.dendrogram(hc_complete))
plot(dend_complete)

#4.Plot colored Dendrogram with 4 clusters.
#Cut the dendrogram to get 4 clusters.
clusters_complete <- cutree(hc_complete, k = 4)
table(clusters_complete)
## clusters_complete
##  1  2  3  4 
## 27  6 11  6
#Color branches by clusters.
dend_colored_complete <- color_branches(dend_complete, k = 4)%>%
  set("branches_lwd", 2)

# Plot the colored dendrogram
plot(dend_colored_complete,
     main = "Dendrogram - Complete Linkage (4 Clusters)",
     ylab = "Height (Distance)")

###############   Hierarchical Clustering   ################
#################       Ward Linkage       #################

#1.Perform hierarchical clustering using ward linkage, 
#when we use method = "ward.D2" in hclust(), 
#R automatically squares the Euclidean distances internally.
hc_ward <- hclust(dist_matrix, method = "ward.D2")

clusters_ward <- cutree(hc_ward, k = 4)
dend <- as.dendrogram(hc_ward)
dend_colored <- color_branches(dend, 4)%>%
  set("branches_lwd", 2)

plot(dend_colored,
     main = "Dendrogram - Ward Linkage (4 Clusters)",
     ylab = "Height (Distance)")

  1. Use elbow method to determine the number of clusters for K-means cluster method. Run K-means cluster analysis.
################    Elbow Method   ################   
#1.Scree plot.
fviz_nbclust(crime_data, kmeans, method = "wss", k.max=12)

#2.Run K-means clustering with k = 2.
set.seed(123)  # for reproducibility
kmeans_result <- kmeans(crime_data, centers = 2, nstart = 100)
# Optional: Add cluster labels to your original data
crime_data$cluster <- kmeans_result$cluster

#3.Plot results of final k-means model.
fviz_cluster(kmeans_result, data = crime_data)

  1. Find a range of values and variances for all the variables in your data. What do you notice about the scale and variability of the variables.

    summary_stats <- data.frame(
      Min = sapply(crime_data, min),
      Max = sapply(crime_data, max),
      Range = sapply(crime_data, function(x) max(x) - min(x)),
      Variance = sapply(crime_data, var)
    )
    print(summary_stats)
    ##             Min    Max  Range     Variance
    ## Murder      1.0   13.5   12.5 1.193492e+01
    ## Rape       11.6   72.7   61.1 2.097634e+02
    ## Robbery     7.0  514.0  507.0 1.188956e+04
    ## Assault    32.0  605.0  573.0 1.937354e+04
    ## Burglary  385.0 2221.0 1836.0 1.758950e+05
    ## Theft    1358.0 4373.0 3015.0 5.652766e+05
    ## Vehicle    99.0  906.0  807.0 4.399736e+04
    ## cluster     1.0    2.0    1.0 2.142857e-01
    • The scale and variability of all the variables are very different (i.e. Murder only has a range of 12.5 and a vraiance of 11.9 while Theft has a range of 3015 and a variance of 565276.6).
  2. Standardize the data and run a) and b) again. Compare your results.

    #Scaling data since variables seem to have different scales.
    crime_scaled <- scale(crime_data)
    
    #Redoing a) with standarized data.
    
    ###############   Hierarchical Clustering   #############
    #################    Complete Linkage   #################
    
    #1.Compute distance matrix.
    #The dist() function uses Euclidean distance by default.
    dist_matrix <- dist(crime_scaled)
    
    #2.Perform hierarchical clustering using complete linkage.
    hc_complete <- hclust(dist_matrix, method = "complete")
    
    #3.Plot the dendrogram, dendsort function orders dendodram.
    dend_complete <- dendsort(as.dendrogram(hc_complete))
    plot(dend_complete)

    #4.Plot colored Dendrogram with 3 clusters.
    #Cut the dendrogram to get 3 clusters.
    clusters_complete <- cutree(hc_complete, k = 3)
    table(clusters_complete)
    ## clusters_complete
    ##  1  2  3 
    ## 12 30  8
    #Color branches by clusters.
    dend_colored_complete <- color_branches(dend_complete, k = 3)%>%
      set("branches_lwd", 2)
    
    # Plot the colored dendrogram
    plot(dend_colored_complete,
         main = "Dendrogram - Complete Linkage (3 Clusters)",
         ylab = "Height (Distance)")

    ###############   Hierarchical Clustering   ################
    #################       Ward Linkage       #################
    
    #1.Perform hierarchical clustering using ward linkage, 
    #when we use method = "ward.D2" in hclust(), 
    #R automatically squares the Euclidean distances internally.
    hc_ward <- hclust(dist_matrix, method = "ward.D2")
    
    clusters_ward <- cutree(hc_ward, k = 4)
    dend <- as.dendrogram(hc_ward)
    dend_colored <- color_branches(dend, 4)%>%
      set("branches_lwd", 2)
    
    plot(dend_colored,
         main = "Dendrogram - Ward Linkage (4 Clusters)",
         ylab = "Height (Distance)")

    #Redoing b) with standarized data.
    ################    Elbow Method   ################   
    #1.Scree plot.
    fviz_nbclust(crime_scaled, kmeans, method = "wss", k.max=12)

    #2.Run K-means clustering with k = 3.
    set.seed(456)  # for reproducibility
    kmeans_result <- kmeans(crime_scaled, centers = 3, nstart = 100)
    # Optional: Add cluster labels to your original data
    crime_data$cluster <- kmeans_result$cluster
    
    #3.Plot results of final k-means model.
    fviz_cluster(kmeans_result, data = crime_scaled)

  1. Which method performed best and why?
  1. Describe your clusters by comparing the means of different crime rates across clusters.

    crime_scaled <- as.data.frame(crime_scaled)
    
    #For hierarchial-complete linkage.
    crime_scaled$Cluster <- as.factor(clusters_complete)  
    crime_vars <- c("Murder", "Rape", "Robbery", "Assault", "Burglary", "Theft", "Vehicle")  # replace with your variable names
    aggregate(crime_scaled[crime_vars], by = list(cluster = crime_scaled$Cluster), FUN = mean)
    ##   cluster      Murder        Rape    Robbery     Assault    Burglary
    ## 1       1 -1.08094421 -1.05609650 -1.0227203 -1.03511646 -1.03301145
    ## 2       2  0.05519058  0.03033398  0.1513216  0.06111601  0.09737758
    ## 3       3  1.41445164  1.47039234  0.9666243  1.32348966  1.18435125
    ##         Theft    Vehicle
    ## 1 -0.54844363 -0.9855285
    ## 2 -0.06013621  0.1267190
    ## 3  1.04817623  1.0030966
    #For hierarchial-ward.
    crime_scaled$Cluster <- as.factor(clusters_ward)  
    crime_vars <- c("Murder", "Rape", "Robbery", "Assault", "Burglary", "Theft", "Vehicle")  # replace with your variable names
    aggregate(crime_scaled[crime_vars], by = list(cluster = crime_scaled$Cluster), FUN = mean)
    ##   cluster      Murder        Rape    Robbery     Assault    Burglary      Theft
    ## 1       1 -1.04131561 -0.96148777 -0.9547020 -1.02493844 -0.90916625 -0.2804059
    ## 2       2 -0.07989126 -0.22072851  1.2070357  0.41503736 -0.01553927 -0.2375592
    ## 3       3  0.20052417 -0.04299807 -0.2055452 -0.03752097 -0.28635661 -0.6360887
    ## 4       4  0.91763620  1.20722287  0.6311770  0.92647786  1.33990832  1.2127705
    ##      Vehicle
    ## 1 -0.9174220
    ## 2  1.3406495
    ## 3 -0.3925762
    ## 4  0.7492754
    #For k-means.
    crime_scaled$Cluster <- as.factor(kmeans_result$cluster)  
    crime_vars <- c("Murder", "Rape", "Robbery", "Assault", "Burglary", "Theft", "Vehicle")  # replace with your variable names
    aggregate(crime_scaled[crime_vars], by = list(cluster = crime_scaled$Cluster), FUN = mean)
    ##   cluster     Murder        Rape    Robbery    Assault     Burglary      Theft
    ## 1       1 -0.6982854 -0.77138855 -0.7654457 -0.8495717 -0.797085483 -0.4900018
    ## 2       2  0.2288672  0.08511005  0.5756336  0.4430911  0.007804831 -0.3323984
    ## 3       3  0.9176362  1.20722287  0.6311770  0.9264779  1.339908321  1.2127705
    ##      Vehicle
    ## 1 -0.8162838
    ## 2  0.5478443
    ## 3  0.7492754

Problem 2:

The dataset SVI Village contains SVI variables.

#Loading the data.
svi <- read.csv('/Users/michaelcajigal/Desktop/MA564/Homework 4/SVI Village.csv')
head(svi)
##         Village     A1_P      A2_P  A3_P     A4_P     B1_P     B2_P      B3_P
## 1 Agana Heights 17.93592  6.975415 19276 17.13636 8.219538 30.82983  9.637605
## 2          Agat 26.54057 13.676093 14749 23.29920 8.867195 34.26886 11.714460
## 3          Asan 18.71783  7.788162 21626 16.65326 8.469818 31.77351  8.469818
## 4     Barrigada 20.13521  6.554071 19279 17.84249 8.078873 31.97746  7.864789
## 5        Dededo 24.66013  8.402469 13550 24.11425 7.235832 34.07427  7.411610
## 6       Hagatna 27.21218  9.619687 16093 28.55093 5.328259 24.07231  9.229305
##       B4_P       C2_P       D1_P      D2_P     D3_P      D4_P        D5_P
## 1 14.78102 0.14232849 14.6708961 0.4758128 16.69708  7.025547  1.52310924
## 2 16.91819 0.04493372  7.2281167 1.8567639 25.41700  8.895949  2.11511084
## 3 14.08451 0.20618557  0.7989348 0.3994674 14.86698  4.851330  0.04679457
## 4 15.35730 0.52832043  5.9622642 1.3962264 19.35198  4.837994  3.59436620
## 5 15.26115 0.34968455  5.6980279 0.9977395 28.39137  6.945956  1.18817168
## 6 17.13287 2.05973223 29.5454545 0.2525253 22.72727 12.937063 21.97906755
##        H1_P      H2_P    H3A_P     H3B_P     H3C_P      H4_P      H5_P     H6_P
## 1 0.2379064 12.529738 2.616971  5.551150 0.7137193  9.992070  4.916733 3.832117
## 2 0.6631300 13.859416 9.748011 17.241379 2.3209549 14.257294 13.925729 3.733122
## 3 0.0000000 36.484687 3.595206  8.122503 1.0652463  7.589880  6.524634 2.660407
## 4 0.3773585 41.509434 4.528302  8.679245 1.4716981 11.509434  9.962264 2.796272
## 5 0.9665601 24.156209 8.207966 14.958298 1.8941461 13.703328 11.894926 4.171201
## 6 0.0000000  8.333333 2.525253  6.060606 1.5151515  7.828283 20.202020 7.342657
##       H7_P     H8_P      H9_P      P1_P     P2_P     P3_P     P4_P
## 1 19.98175 23.99635  7.299270  9.952731 15.25735 25.63869 13.54560
## 2 23.74901 34.07466  7.863384 11.673785 21.51719 30.73868 15.78296
## 3 18.30986 19.87480  5.790297  6.972391 12.44736 25.19562 13.14149
## 4 22.37017 21.79316  6.080781 14.715493 25.43099 28.85042 18.94785
## 5 26.71382 27.29416  6.909684 21.195737 40.50241 38.25716 24.01518
## 6 32.86713 37.06294 10.839161 30.827783 35.39486 32.16783 28.65169
summary(svi)
##    Village               A1_P            A2_P             A3_P      
##  Length:19          Min.   :12.85   Min.   : 5.670   Min.   :13546  
##  Class :character   1st Qu.:18.33   1st Qu.: 7.670   1st Qu.:15164  
##  Mode  :character   Median :21.08   Median : 8.397   Median :17882  
##                     Mean   :21.64   Mean   : 9.091   Mean   :17675  
##                     3rd Qu.:24.55   3rd Qu.: 9.720   3rd Qu.:19292  
##                     Max.   :29.70   Max.   :15.576   Max.   :26303  
##       A4_P            B1_P            B2_P            B3_P       
##  Min.   :12.49   Min.   :4.220   Min.   :24.07   Min.   : 5.916  
##  1st Qu.:17.49   1st Qu.:6.050   1st Qu.:31.17   1st Qu.: 7.460  
##  Median :19.66   Median :6.523   Median :33.19   Median : 8.077  
##  Mean   :19.79   Mean   :6.875   Mean   :32.90   Mean   : 8.562  
##  3rd Qu.:21.93   3rd Qu.:7.994   3rd Qu.:35.14   3rd Qu.: 9.614  
##  Max.   :28.55   Max.   :9.452   Max.   :39.51   Max.   :12.270  
##       B4_P             C2_P             D1_P             D2_P       
##  Min.   : 9.888   Min.   :0.0000   Min.   : 0.000   Min.   :0.0000  
##  1st Qu.:13.968   1st Qu.:0.0733   1st Qu.: 1.047   1st Qu.:0.3691  
##  Median :15.426   Median :0.2062   Median : 5.962   Median :0.4758  
##  Mean   :15.736   Mean   :0.3988   Mean   :12.034   Mean   :0.6874  
##  3rd Qu.:17.318   3rd Qu.:0.4362   3rd Qu.:17.974   3rd Qu.:0.9931  
##  Max.   :21.637   Max.   :2.0597   Max.   :54.445   Max.   :1.8568  
##       D3_P            D4_P             D5_P              H1_P        
##  Min.   :13.03   Min.   : 1.124   Min.   : 0.0000   Min.   :0.00000  
##  1st Qu.:16.59   1st Qu.: 4.845   1st Qu.: 0.8245   1st Qu.:0.04353  
##  Median :20.64   Median : 6.247   Median : 1.5231   Median :0.42634  
##  Mean   :21.63   Mean   : 6.594   Mean   : 3.5976   Mean   :0.61984  
##  3rd Qu.:26.28   3rd Qu.: 8.516   3rd Qu.: 3.5305   3rd Qu.:0.71976  
##  Max.   :32.75   Max.   :12.937   Max.   :21.9791   Max.   :3.14136  
##       H2_P            H3A_P            H3B_P            H3C_P       
##  Min.   : 4.655   Min.   : 1.297   Min.   : 3.990   Min.   :0.4544  
##  1st Qu.:14.373   1st Qu.: 3.321   1st Qu.: 7.392   1st Qu.:1.0304  
##  Median :31.533   Median : 6.056   Median : 9.107   Median :1.5226  
##  Mean   :32.527   Mean   : 6.652   Mean   :11.345   Mean   :1.7611  
##  3rd Qu.:41.456   3rd Qu.: 8.801   3rd Qu.:16.100   3rd Qu.:2.2183  
##  Max.   :81.549   Max.   :17.801   Max.   :21.746   Max.   :3.9539  
##       H4_P             H5_P             H6_P            H7_P      
##  Min.   : 2.926   Min.   : 4.644   Min.   :1.865   Min.   :15.36  
##  1st Qu.: 9.610   1st Qu.: 7.381   1st Qu.:3.322   1st Qu.:18.79  
##  Median :11.668   Median : 9.962   Median :3.888   Median :22.37  
##  Mean   :12.165   Mean   :11.510   Mean   :4.153   Mean   :22.37  
##  3rd Qu.:14.406   3rd Qu.:12.910   3rd Qu.:4.333   3rd Qu.:24.83  
##  Max.   :25.865   Max.   :25.041   Max.   :8.187   Max.   :32.87  
##       H8_P            H9_P             P1_P             P2_P       
##  Min.   :15.47   Min.   : 4.494   Min.   : 3.080   Min.   : 4.987  
##  1st Qu.:21.18   1st Qu.: 5.945   1st Qu.: 6.844   1st Qu.:12.096  
##  Median :23.40   Median : 7.042   Median : 9.953   Median :15.895  
##  Mean   :25.39   Mean   : 7.065   Mean   :12.868   Mean   :20.406  
##  3rd Qu.:30.05   3rd Qu.: 7.738   3rd Qu.:17.949   3rd Qu.:26.897  
##  Max.   :39.77   Max.   :10.839   Max.   :34.681   Max.   :50.597  
##       P3_P            P4_P      
##  Min.   :20.47   Min.   :10.53  
##  1st Qu.:24.47   1st Qu.:13.46  
##  Median :26.89   Median :15.18  
##  Mean   :27.69   Mean   :17.19  
##  3rd Qu.:30.76   3rd Qu.:20.24  
##  Max.   :38.26   Max.   :30.34
#Checking for any missing data because PCA interpretation changes.
colSums(is.na(svi))
## Village    A1_P    A2_P    A3_P    A4_P    B1_P    B2_P    B3_P    B4_P    C2_P 
##       0       0       0       0       0       0       0       0       0       0 
##    D1_P    D2_P    D3_P    D4_P    D5_P    H1_P    H2_P   H3A_P   H3B_P   H3C_P 
##       0       0       0       0       0       0       0       0       0       0 
##    H4_P    H5_P    H6_P    H7_P    H8_P    H9_P    P1_P    P2_P    P3_P    P4_P 
##       0       0       0       0       0       0       0       0       0       0
  1. Standardize all SVI variables (mean = 0, standard deviation = 1).
svi[, 2:30] <- scale(svi[, 2:30])
  1. Run PCA on the standardized data.
#Run PCA on standardized SVI data but 
pca_result <- prcomp(svi[, -1], center = FALSE, scale. = FALSE)
summary(pca_result)
## Importance of components:
##                           PC1    PC2    PC3    PC4     PC5     PC6    PC7
## Standard deviation     3.2965 2.7998 1.7995 1.3835 1.27771 1.00980 0.9011
## Proportion of Variance 0.3747 0.2703 0.1117 0.0660 0.05629 0.03516 0.0280
## Cumulative Proportion  0.3747 0.6450 0.7567 0.8227 0.87899 0.91415 0.9422
##                            PC8     PC9    PC10    PC11    PC12    PC13    PC14
## Standard deviation     0.66524 0.57581 0.53093 0.45027 0.34801 0.28273 0.24476
## Proportion of Variance 0.01526 0.01143 0.00972 0.00699 0.00418 0.00276 0.00207
## Cumulative Proportion  0.95741 0.96884 0.97856 0.98555 0.98973 0.99249 0.99455
##                           PC15    PC16    PC17    PC18      PC19
## Standard deviation     0.23343 0.22490 0.17414 0.15031 2.292e-15
## Proportion of Variance 0.00188 0.00174 0.00105 0.00078 0.000e+00
## Cumulative Proportion  0.99643 0.99818 0.99922 1.00000 1.000e+00
plot(pca_result, type = "l", main = "Scree Plot")

pca_result$rotation
##                PC1         PC2          PC3         PC4         PC5
## A1_P  -0.110405193  0.28157055  0.073684340 -0.05319899  0.26446841
## A2_P  -0.229807042  0.10300136  0.178505407 -0.01630685  0.20130082
## A3_P   0.219161395 -0.18557262  0.068673894  0.14218256 -0.02987983
## A4_P  -0.141924138  0.27367590 -0.007639570 -0.18454128 -0.19831300
## B1_P   0.100201072 -0.11953639  0.206041487 -0.52515533 -0.04009545
## B2_P  -0.239082347 -0.12211749 -0.006877935 -0.11093227  0.20912763
## B3_P  -0.158708062  0.04690335  0.365135445 -0.27647077 -0.10529670
## B4_P  -0.228864135  0.13019809  0.187546520  0.04027536  0.02594598
## C2_P   0.122786988  0.21814550 -0.110841121  0.19109249 -0.34503302
## D1_P   0.198449376  0.17502385  0.047133280  0.07784710  0.13667398
## D2_P  -0.049532102 -0.03904538 -0.304048632 -0.46836846 -0.15568035
## D3_P  -0.258309030  0.13777453 -0.151663081 -0.05417840  0.05779719
## D4_P  -0.006723012  0.30960268  0.241771221 -0.02441526  0.04737796
## D5_P   0.113090213  0.18011196 -0.049350581  0.25687371 -0.32077189
## H1_P  -0.242927248  0.02229721 -0.185117429  0.16906871  0.22356366
## H2_P  -0.152417706 -0.15907636 -0.177257575  0.12701477 -0.33086042
## H3A_P -0.279567445 -0.02238068 -0.142333271  0.06666404  0.13007966
## H3B_P -0.261679273 -0.03233865 -0.225928248 -0.07668115  0.02512619
## H3C_P -0.233041371 -0.03137547 -0.264667484  0.05720483 -0.03768310
## H4_P  -0.256934234 -0.09032621 -0.137701055 -0.06313781 -0.20496025
## H5_P  -0.240044194  0.11266112 -0.032156523  0.04739911 -0.30918088
## H6_P  -0.175958833  0.24501013  0.090784081  0.24550515  0.04820461
## H7_P   0.123819533  0.30251464 -0.078304641 -0.07605460  0.06605872
## H8_P  -0.190562968  0.24469159  0.107955319 -0.00360906  0.02319528
## H9_P  -0.059126946  0.21728409  0.213927800 -0.14173910 -0.41153476
## P1_P   0.168430703  0.26889143 -0.177552675 -0.02974207  0.05174605
## P2_P   0.165456355  0.22673702 -0.256498642 -0.13659272  0.14032002
## P3_P   0.142321750  0.11972796 -0.339394803 -0.27210814  0.05631510
## P4_P   0.129075480  0.28201993 -0.211558299 -0.02697543  0.06016054
##                PC6          PC7          PC8          PC9          PC10
## A1_P  -0.110937190 -0.034572780  0.378204647 -0.081878184  0.0114244633
## A2_P   0.193218965  0.296513027  0.256199409  0.029642873 -0.1060430290
## A3_P   0.182494948 -0.277691939  0.218927838  0.079335960 -0.0234923160
## A4_P  -0.070570942  0.084618242 -0.009594626 -0.368690769 -0.0007068658
## B1_P   0.200161092 -0.187313499  0.039887043 -0.092704563  0.2736457218
## B2_P  -0.278689452  0.059144001  0.154275761 -0.378944970 -0.1814515136
## B3_P   0.267027427 -0.121485218  0.074787287  0.059683421 -0.0608621664
## B4_P  -0.317250394  0.133479971 -0.138191598  0.235500310 -0.0633628047
## C2_P   0.051778062 -0.021935334  0.469504284 -0.178864813  0.3675666262
## D1_P   0.024616275 -0.469540774  0.094859702  0.165875100 -0.5371144431
## D2_P  -0.098214645  0.172996710  0.409142912  0.502564590 -0.0685312090
## D3_P  -0.008196978 -0.091798491 -0.098872034 -0.072131389 -0.0622036158
## D4_P  -0.026161466  0.131741841 -0.108349890  0.188093642  0.1669066046
## D5_P   0.309217758  0.468055084 -0.027699143  0.131169901 -0.2443098801
## H1_P   0.054029690 -0.135422536 -0.121256649  0.258521834  0.4378149549
## H2_P  -0.444923012 -0.103794455  0.156400764  0.063002927  0.0130666444
## H3A_P  0.137419804 -0.017546786  0.093670696  0.079649857 -0.0410319047
## H3B_P  0.165229834  0.002100843  0.021988030 -0.072880887 -0.2654572509
## H3C_P  0.298545246 -0.210132897 -0.034577442  0.027152064  0.0085515054
## H4_P   0.035523805 -0.183946161 -0.241576574  0.112953583  0.0559438344
## H5_P   0.207885806 -0.138929839 -0.015577205 -0.162162297 -0.1954362207
## H6_P   0.026877566 -0.104444757  0.054245368 -0.139848731  0.1247425120
## H7_P  -0.102570977 -0.001253024 -0.201112473  0.187579379 -0.0240712481
## H8_P   0.154699994 -0.106813572  0.052087441  0.216670351  0.1452471641
## H9_P  -0.226047101 -0.200243516 -0.203125047  0.028056344 -0.0723197756
## P1_P  -0.031443249 -0.108236354  0.051093983 -0.007096884 -0.0336017988
## P2_P  -0.069319781 -0.020166952 -0.038213814 -0.029303265 -0.0170377092
## P3_P   0.188244492  0.156751352 -0.269009958 -0.203163474  0.0403981277
## P4_P  -0.070140567 -0.184086140  0.045694352 -0.046860375  0.0437928104
##               PC11        PC12        PC13         PC14         PC15
## A1_P  -0.343469595  0.11640143 -0.10143615  0.168875739  0.120718919
## A2_P   0.121568054 -0.34979800 -0.33026684  0.132230610  0.006640825
## A3_P  -0.257717418  0.03574748 -0.29325428  0.213417894 -0.299225581
## A4_P  -0.086717849 -0.09567560  0.02175925  0.001659139 -0.137062813
## B1_P   0.132805240  0.28403092 -0.14590917  0.071517723  0.063753969
## B2_P   0.032025662 -0.16773106  0.03870547  0.275355042 -0.190510253
## B3_P   0.239202459 -0.05017170  0.16041504 -0.259432002 -0.137961263
## B4_P  -0.238122190  0.34119862 -0.08059716 -0.214373517 -0.187506461
## C2_P   0.120526061 -0.18741290  0.06487690 -0.045966416  0.100857025
## D1_P   0.027345133 -0.21052710  0.14568270 -0.008618885  0.084506495
## D2_P  -0.065752792 -0.05538500  0.13696813 -0.085329498 -0.256467616
## D3_P   0.171403355  0.14070856  0.28928388  0.292139484 -0.148854780
## D4_P   0.125192128 -0.08999613 -0.14751401  0.146754961  0.262649615
## D5_P   0.015013636  0.11662147  0.12089049  0.209631247 -0.076150827
## H1_P   0.130633720 -0.13534117  0.01732387  0.082142168 -0.116410300
## H2_P   0.087579159  0.11660295 -0.25234140 -0.004715721  0.230473202
## H3A_P  0.134179644 -0.05037619 -0.07668861 -0.289357950  0.189335075
## H3B_P  0.101456573  0.13740503 -0.08495808 -0.184125125  0.417948065
## H3C_P -0.298454130 -0.10520512 -0.22755587 -0.107789272 -0.033875747
## H4_P   0.039092080 -0.26306276  0.13892100  0.354495946 -0.064786682
## H5_P   0.127561793  0.35466922 -0.28730358  0.138699410 -0.154370234
## H6_P  -0.007482911  0.02286841  0.14778535 -0.421947110 -0.340916042
## H7_P   0.254270270 -0.08182824 -0.49934027  0.023248043 -0.220771141
## H8_P  -0.337624996  0.21275909  0.21697032  0.256974807  0.254099189
## H9_P  -0.194501093 -0.34520884 -0.04711597 -0.055674226  0.150732227
## P1_P   0.105244669  0.07677385  0.08612174 -0.009705724  0.022944345
## P2_P   0.197637024  0.15732937  0.06172915  0.055400486  0.129012828
## P3_P  -0.403269604 -0.16156674 -0.08710221 -0.148275339  0.016240853
## P4_P   0.081557181  0.11879031 -0.07002027 -0.012895247 -0.162548168
##              PC16         PC17        PC18         PC19
## A1_P  -0.09013328  0.017160928 -0.14820916  0.212019975
## A2_P  -0.25278226 -0.097339136  0.02235069 -0.353992249
## A3_P  -0.16955108  0.127988027  0.33776271  0.143758809
## A4_P  -0.09190108 -0.257158446  0.31819278 -0.046467362
## B1_P   0.07920701  0.095259842 -0.18084417 -0.029738921
## B2_P   0.33350751  0.146941426 -0.19116815  0.321516242
## B3_P  -0.24457956 -0.208725398 -0.08039521  0.483819393
## B4_P  -0.11856540 -0.044339796 -0.37570022 -0.138959743
## C2_P   0.02534215  0.058722566 -0.24749633 -0.087690884
## D1_P   0.01711438  0.104429816 -0.16082026 -0.046441800
## D2_P   0.19070119  0.111943199  0.11852586 -0.111595834
## D3_P  -0.27290317 -0.103327652  0.21545347 -0.085910105
## D4_P   0.24387259  0.128724207  0.03404028  0.107775292
## D5_P   0.01875358 -0.002135237 -0.14388034  0.325906054
## H1_P   0.04484245  0.001270653  0.01714284  0.142909120
## H2_P  -0.26980611 -0.039728819  0.07812882  0.333982227
## H3A_P -0.29498258  0.312418437 -0.10797524  0.044443071
## H3B_P  0.19726632  0.013702023  0.29299128  0.032491669
## H3C_P  0.32298818 -0.527171913 -0.22718281  0.048775974
## H4_P  -0.12753865  0.187392220 -0.25390787 -0.044884214
## H5_P   0.13494902  0.285677416 -0.11625324 -0.223676520
## H6_P   0.20893588  0.330112009  0.21327929  0.036620729
## H7_P   0.10679991  0.013174161  0.13260494  0.211281617
## H8_P   0.03074438  0.053433449  0.21475029  0.033689383
## H9_P   0.05555000  0.060425653  0.08457116 -0.079043522
## P1_P   0.03271090 -0.190882559 -0.04523574  0.071460153
## P2_P   0.02100106  0.014625502 -0.04108712  0.007259177
## P3_P  -0.27374242  0.331299064 -0.08933812  0.178903709
## P4_P  -0.22963243 -0.145981789 -0.11974727 -0.147646757
pca_scores <- pca_result$x
print(pca_scores)
##               PC1        PC2        PC3         PC4          PC5         PC6
##  [1,]  2.01626140 -1.3117972  2.0569104 -0.40524385  0.144611858  0.11690715
##  [2,] -2.45751579  1.3264795  0.3938806 -3.52814620  0.333926627  1.27482597
##  [3,]  1.92988702 -2.9253588  1.3132288 -0.06751008  0.105725452 -0.03899067
##  [4,]  1.54145840 -1.1205420 -1.0272840 -1.04083079 -0.535039773 -0.44528663
##  [5,] -0.31404345  2.1071456 -2.5730746 -1.89342712  0.916190493 -0.19367902
##  [6,]  1.85234241  7.8355419  0.5946449  1.38272618 -2.708595248  1.13057879
##  [7,] -5.54982765 -1.7756187 -1.4157004  0.14478113 -2.451531340 -0.01532043
##  [8,]  0.05655776  1.0779630 -1.0769165  0.25258515  0.486694908 -1.10522613
##  [9,] -4.86794927 -0.1297912  2.1373147 -0.53268855 -0.004339196  0.77405105
## [10,]  0.50033182  2.9181434  2.4175775  0.25074315  1.294257581 -2.04570507
## [11,] -0.01132788 -1.4798246 -0.2957122  0.12945381 -0.676283680 -1.04351009
## [12,]  3.40965672 -4.2661756 -0.3879596  0.52310410  0.946097671  1.81237142
## [13,]  2.92140323 -3.2158932 -0.2495083  1.75685170 -0.483621927  0.95552218
## [14,]  1.76433385 -0.2676727  3.0179157 -1.50384358 -0.327796923  0.13792239
## [15,] -0.40470772 -2.4703966 -0.7094904  0.84594281 -1.718259031 -0.94206040
## [16,]  6.60842467  2.9601643 -1.1341559  0.74759445  1.303764045  0.25973871
## [17,] -7.24397386  1.2560931  0.5751780  2.75333129  2.420006254  0.86759430
## [18,] -0.93034010  0.6734392 -4.3109883 -0.30407150  0.793466298 -0.02302702
## [19,] -0.82097155 -1.1918995  0.6741394  0.48864789  0.160725931 -1.47670651
##               PC7          PC8         PC9        PC10          PC11
##  [1,] -0.21668538 -0.591444868  0.30574819  0.52933023  0.0178062978
##  [2,]  0.66704555  0.875599882  1.04033298 -0.24616697 -0.2390348554
##  [3,]  0.19680973  0.305030467 -0.43084599  0.64867192 -0.0785395195
##  [4,]  0.16247084  0.592714100  0.58872160  0.47060876  0.0009765367
##  [5,]  0.27677089 -0.845943489 -0.89548619  0.46294148 -0.1532784260
##  [6,]  0.34330329 -0.006319592 -0.20851731  0.11463392 -0.4272194572
##  [7,] -1.75448111 -0.816365555  0.43331281 -0.36315797  0.0426013148
##  [8,]  0.43314064  0.148315419  0.45824001 -0.81231067 -0.0512350363
##  [9,]  0.25217730  0.604218173 -1.30125742 -0.80228294  0.8041078762
## [10,]  0.01774921 -0.065965393 -0.19038845 -0.50440204 -0.6992882007
## [11,] -0.15445306 -0.396438935  0.30604980 -0.46552860  0.1328410755
## [12,] -1.18121610  0.124385579 -0.45504733 -0.61235413 -1.0063092174
## [13,]  2.43761890 -0.661906136  0.36568322 -0.36680961  0.3194940098
## [14,] -0.54627793 -1.042603584 -0.05164923  0.54930986  0.3631999279
## [15,] -0.03691899  1.576502712 -0.41185706  0.70490048 -0.0903511229
## [16,] -1.56898748  0.572473179  0.32768907 -0.19880921  0.9791569124
## [17,] -0.18418076  0.018750740  0.56091277  0.74725845 -0.0316064790
## [18,]  0.50626948 -0.343438120 -0.50465821  0.21440677  0.0768640673
## [19,]  0.34984498 -0.047564581  0.06301675 -0.07023974  0.0398142959
##              PC12        PC13        PC14         PC15         PC16        PC17
##  [1,] -0.06888670  0.86088664 -0.25444701 -0.110820122  0.234183599  0.03767299
##  [2,] -0.23930748 -0.14303507  0.03733118  0.154976955  0.087169406  0.03372846
##  [3,]  0.60651879 -0.36280743 -0.13120301  0.164173624 -0.148649884 -0.30526816
##  [4,]  0.76410513  0.02245136  0.08769664 -0.476630455  0.024109814  0.03652202
##  [5,]  0.19093545  0.09656717  0.07395211  0.147247591 -0.334687858  0.33801636
##  [6,] -0.01779764 -0.05900122 -0.08287841 -0.165712269 -0.043368001 -0.03283016
##  [7,]  0.30085276  0.05419559  0.28098897  0.184637307  0.064910209 -0.04809626
##  [8,] -0.08997669  0.40444278 -0.15512003  0.079318118 -0.544025403 -0.31027229
##  [9,]  0.19496668  0.14981079 -0.05185524 -0.153761302  0.045042574  0.01380177
## [10,]  0.29053558 -0.04508967  0.18771798  0.255801685  0.369996223  0.02657094
## [11,] -0.09773438 -0.43813782 -0.75131531 -0.056680960  0.055373358  0.23647091
## [12,] -0.17255459 -0.02443721  0.01907914 -0.145536099 -0.063330098  0.03504450
## [13,]  0.10509357 -0.01279334  0.23092706  0.232587961  0.074595012  0.11261927
## [14,] -0.49747572 -0.26684021  0.10224691  0.087830190 -0.134730437 -0.15981492
## [15,] -0.48678387  0.16240773  0.02444873  0.326564376  0.005037615  0.12116061
## [16,]  0.03091136 -0.06470508  0.13247977  0.119361426  0.048486275  0.07774047
## [17,]  0.02686424 -0.08515724 -0.08144679  0.002476426 -0.089439063  0.05116034
## [18,] -0.32268285 -0.05843786 -0.08499120 -0.103686248  0.466315765 -0.33252768
## [19,] -0.51758363 -0.19031989  0.41638852 -0.542148205 -0.116989107  0.06830083
##               PC18         PC19
##  [1,]  0.162382898 2.394984e-15
##  [2,]  0.119979142 1.121731e-15
##  [3,]  0.291390291 2.896413e-15
##  [4,] -0.284191814 2.312846e-15
##  [5,]  0.050551294 1.638041e-15
##  [6,]  0.044673190 1.779717e-15
##  [7,]  0.079184615 1.183898e-15
##  [8,] -0.074466006 2.094358e-15
##  [9,] -0.052205259 1.156540e-15
## [10,] -0.082314760 2.533827e-15
## [11,]  0.002429587 2.087653e-15
## [12,] -0.072302008 2.636583e-15
## [13,] -0.042899923 2.345578e-15
## [14,] -0.288114682 3.211997e-15
## [15,] -0.092704244 2.488035e-15
## [16,]  0.081495643 3.076341e-15
## [17,] -0.068061386 1.863274e-15
## [18,] -0.027200887 1.838719e-15
## [19,]  0.252374309 2.633181e-15
  1. Perform Classical Multidimensional Scaling (MDS) using PCA results.
dist_matrix <- dist(svi[, -1])
mds_result <- cmdscale(dist_matrix, k = 3)  # k = number of dimensions
village_names <- svi[, 1]  
  1. Create a 2D scatterplot (map) of the villages based on the MDS results.
plot(mds_result, type = "n", main = "Classical MDS")
text(mds_result, labels = village_names, cex = 0.7)

  1. Run hierarchical cluster analysis using PCs from part a). Plot dendrogram.
pc_subset <- pca_scores[, 1:3]  # First 3 principal components
pc_dist <- dist(pc_subset)
hc <- hclust(pc_dist, method = "ward.D2")  # You can also try "complete", "average", etc.
plot(hc, labels = svi[, 1], main = "Hierarchical Clustering Dendrogram", xlab = "", sub = "", cex = 0.7)

  1. Compare the 2D MDS map and the hierarchical clustering structure.

    • Are the clusters consistent between MDS and hierarchical clustering?

    • Do the villages group similarly across methods?

    • Answering both questions: When comparing the 2D Classical MDS map and the hierarchical clustering dendrogram, we see a strong degree of consistency in how the villages group based on their social vulnerability characteristics. Both methods aim to identify patterns of similarity, but they visualize them differently: MDS preserves the pairwise distances between villages in a spatial layout, while hierarchical clustering builds a tree based on how villages group together incrementally.

      A clear example of agreement between the two methods is seen with the southern villages of Inarajan, Merizo, and Umatac. These villages are tightly clustered together on the left side of the MDS map and also form a distinct branch on the far left of the dendrogram, indicating high similarity and early merging. Similarly, Tamuning and Hagåtña stand out in both visualizations. They appear far from the rest of the villages in the MDS plot, suggesting they are quite distinct, and in the dendrogram, they merge at a higher level (i.e., greater dissimilarity), which reinforces their uniqueness.

      There is also a mid-group of central or northern villages, such as Yigo, Dededo, Mangilao, and M-T-M, that cluster together in both methods. On the MDS map, these villages appear relatively close in space, and in the dendrogram, they form a tight sub-cluster, showing consistency in how the methods identify shared characteristics. A similar consistency is observed among central villages like Yona, Talofofo, Ordot-Chalan Pago, Agana Heights, and Sinajana, which are grouped in the middle of the MDS space and are also grouped together in the middle section of the dendrogram.

      While the overall patterns are similar, there are slight differences in how some villages are grouped depending on the method. For instance, Agat appears spatially closer to the southern villages in the MDS map, but in the dendrogram, it joins a cluster with more urbanized areas like M-T-M and Mangilao, suggesting some ambiguity in its grouping. These differences are expected due to the different mathematical criteria used by each method.

      In summary, both the MDS and hierarchical clustering methods reveal broadly consistent clusters among villages, especially for the southernmost villages, urban centers, and mid-region communities. Although the specific distances and cluster boundaries may vary slightly, the overall structure of the groupings aligns well across the two approaches.

Problem 3:

species <- data.frame('D' = c(1000, 100, 940, 2040),
                      'E' = c(100, 10, 110, 220),
                      'F' = c(10, 1, 60, 71))

#Setting index (rownames).
rownames(species) <- c('Sample 1', 'Sample 2', 'Sample 3', 'Total')
  1. Find the distribution of species for each sample. Which samples do you think are close?
samples_only <- species[1:3, ]  #Only the first 3 rows.
species_distribution <- prop.table(as.matrix(samples_only), margin = 1)
round(species_distribution, 3)
##              D     E     F
## Sample 1 0.901 0.090 0.009
## Sample 2 0.901 0.090 0.009
## Sample 3 0.847 0.099 0.054
  1. Calculate the distances between samples using Bray.
dDE<-((abs(species[1,1] - species[1,2]) + abs(species[2,1] - species[2,2]) + abs(species[3,1] - species[3,2]))/ ((species[1,1] + species[1,2]) + (species[2,1] + species[2,2]) + (species[3,1] + species[3,2])))

print(dDE)
## [1] 0.8053097
dDF<-((abs(species[1,1] - species[1,3]) + abs(species[2,1] - species[2,3]) + abs(species[3,1] - species[3,3]))/ ((species[1,1] + species[1,3]) + (species[2,1] + species[2,3]) + (species[3,1] + species[3,3])))

print(dDF)
## [1] 0.9327333
dEF<-((abs(species[1,2] - species[1,3]) + abs(species[2,2] - species[2,3]) + abs(species[3,2] - species[3,3]))/ ((species[1,2] + species[1,3]) + (species[2,2] + species[2,3]) + (species[3,2] + species[3,3])))

print(dEF)
## [1] 0.5120275
  1. Transform the data by taking square root and Wisconsin double standardization.
# Data transformation
transformed_data <- wisconsin(sqrt(species[,]))
transformed_data
##                  D         E         F
## Sample 1 0.4001639 0.3853378 0.2144983
## Sample 2 0.4001639 0.3853378 0.2144983
## Sample 3 0.2944701 0.3067449 0.3987850
## Total    0.3333333 0.3333333 0.3333333
species[,]
##             D   E  F
## Sample 1 1000 100 10
## Sample 2  100  10  1
## Sample 3  940 110 60
## Total    2040 220 71
  1. Calculate the distances again.
dDE<-((abs(transformed_data[1,1] - transformed_data[1,2]) + abs(transformed_data[2,1] - transformed_data[2,2]) + abs(transformed_data[3,1] - transformed_data[3,2]))/ ((transformed_data[1,1] + transformed_data[1,2]) + (transformed_data[2,1] + transformed_data[2,2]) + (transformed_data[3,1] + transformed_data[3,2])))

print(dDE)
## [1] 0.01930144
dDF<-((abs(transformed_data[1,1] - transformed_data[1,3]) + abs(transformed_data[2,1] - transformed_data[2,3]) + abs(transformed_data[3,1] - transformed_data[3,3]))/ ((transformed_data[1,1] + transformed_data[1,3]) + (transformed_data[2,1] + transformed_data[2,3]) + (transformed_data[3,1] + transformed_data[3,3])))

print(dDF)
## [1] 0.2474
dEF<-((abs(transformed_data[1,2] - transformed_data[1,3]) + abs(transformed_data[2,2] - transformed_data[2,3]) + abs(transformed_data[3,2] - transformed_data[3,3]))/ ((transformed_data[1,2] + transformed_data[1,3]) + (transformed_data[2,2] + transformed_data[2,3]) + (transformed_data[3,2] + transformed_data[3,3])))

print(dEF)
## [1] 0.22765
  1. What do you conclude?