Import the dataset

library(readr)
Migrants <- read_csv("~/Downloads/CS 583/R_Project/Migrants.csv")
## Rows: 90 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (7): rownames, migrants, distance, pops66, pops71, popd66, popd71
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Migrants
## # A tibble: 90 × 7
##    rownames migrants distance  pops66  pops71 popd66 popd71
##       <dbl>    <dbl>    <dbl>   <dbl>   <dbl>  <dbl>  <dbl>
##  1        1      255      924  108535  111641 493396 522104
##  2        2     2380      952  756039  788960 493396 522104
##  3        3     1140     1119  616788  534557 493396 522104
##  4        4     2145     1641 5780845 6027764 493396 522104
##  5        5     6295     1996 6960870 7703106 493396 522104
##  6        6      215     3159  963066  988247 493396 522104
##  7        7      185     3542  955344  926242 493396 522104
##  8        8      425     4059 1463203 1627874 493396 522104
##  9        9      425     4838 1873674 2184621 493396 522104
## 10       10      340      924  493396  522104 108535 111641
## # ℹ 80 more rows
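
As the readr message suggests, the column types can be declared explicitly (which also silences the message); a minimal sketch of the same import with an explicit specification, assuming the same file path:

library(readr)
#re-import with every column declared as double
Migrants <- read_csv("~/Downloads/CS 583/R_Project/Migrants.csv",
                     col_types = cols(.default = col_double()))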

Statistical Analysis

The dataset is copied into a new variable to avoid overwriting the original dataset.

mg<-Migrants
str(mg)
## spc_tbl_ [90 × 7] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ rownames: num [1:90] 1 2 3 4 5 6 7 8 9 10 ...
##  $ migrants: num [1:90] 255 2380 1140 2145 6295 ...
##  $ distance: num [1:90] 924 952 1119 1641 1996 ...
##  $ pops66  : num [1:90] 108535 756039 616788 5780845 6960870 ...
##  $ pops71  : num [1:90] 111641 788960 534557 6027764 7703106 ...
##  $ popd66  : num [1:90] 493396 493396 493396 493396 493396 ...
##  $ popd71  : num [1:90] 522104 522104 522104 522104 522104 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   rownames = col_double(),
##   ..   migrants = col_double(),
##   ..   distance = col_double(),
##   ..   pops66 = col_double(),
##   ..   pops71 = col_double(),
##   ..   popd66 = col_double(),
##   ..   popd71 = col_double()
##   .. )
##  - attr(*, "problems")=<externalptr>
View(mg)
summary(mg)
##     rownames        migrants          distance        pops66       
##  Min.   : 1.00   Min.   :   95.0   Min.   : 164   Min.   : 108535  
##  1st Qu.:23.25   1st Qu.:  912.5   1st Qu.: 924   1st Qu.: 616788  
##  Median :45.50   Median : 3087.5   Median :1763   Median : 959205  
##  Mean   :45.50   Mean   : 9227.3   Mean   :1945   Mean   :1997176  
##  3rd Qu.:67.75   3rd Qu.:10291.2   3rd Qu.:2940   3rd Qu.:1873674  
##  Max.   :90.00   Max.   :99430.0   Max.   :4838   Max.   :6960870  
##      pops71            popd66            popd71       
##  Min.   : 111641   Min.   : 108535   Min.   : 111641  
##  1st Qu.: 534557   1st Qu.: 616788   1st Qu.: 534557  
##  Median : 957244   Median : 959205   Median : 957244  
##  Mean   :2141512   Mean   :1997176   Mean   :2141512  
##  3rd Qu.:2184621   3rd Qu.:1873674   3rd Qu.:2184621  
##  Max.   :7703106   Max.   :6960870   Max.   :7703106

Check for any NAs and remove them if present.

The rownames column is also dropped, as it is not needed for the analysis.

#check for na
any(is.na(mg))
## [1] FALSE
#drop the rownames column
mg1<-mg[-1]
View(mg1)
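
For a column-by-column view, and to show how rows with missing values would be dropped if any were present, a small sketch (no rows are removed here because the data set is complete):

#count missing values in each column (all zero for this data set)
colSums(is.na(mg1))
#if any NAs were present, complete cases could be kept with:
#mg1 <- na.omit(mg1)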
Data type

List the data types for each attribute.

dim(mg1)
## [1] 90  6
sapply(mg1, class)
##  migrants  distance    pops66    pops71    popd66    popd71 
## "numeric" "numeric" "numeric" "numeric" "numeric" "numeric"
Standard deviation

The standard deviation is computed for all attributes.

A small standard deviation indicates that the data is clustered tightly around the mean.

A larger standard deviation indicates that the data is more spread out.

sapply(mg1[,1:6], sd)
##    migrants    distance      pops66      pops71      popd66      popd71 
##   15180.645    1218.896 2263918.696 2468572.880 2263918.696 2468572.880
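
Because the attributes are on very different scales, the raw standard deviations are hard to compare; the coefficient of variation (standard deviation divided by the mean) puts them on a common footing. A minimal sketch:

#coefficient of variation for each attribute
sapply(mg1[,1:6], function(x) sd(x)/mean(x))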
Skew

Larger deviations from 0 indicate more skew.

Based on the skewness values and the plot, all attributes are positively skewed, with migrants by far the most strongly skewed (3.24).

library(mlbench)
library(e1071)
skew<-apply(mg1[,1:6], 2, skewness)
print(skew)
##  migrants  distance    pops66    pops71    popd66    popd71 
## 3.2352733 0.4086444 1.3736931 1.3828426 1.3736931 1.3828426
plot(skew, type="l", col='black')
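
To make the positive skew of migrants visible directly, its histogram can be compared with a histogram of log-transformed values; a small illustrative sketch, not used in the later analysis:

#side-by-side histograms: raw migrants vs. log-transformed migrants
par(mfrow = c(1, 2))
hist(mg1$migrants, main = "migrants", xlab = "migrants")
hist(log(mg1$migrants), main = "log(migrants)", xlab = "log(migrants)")
par(mfrow = c(1, 1)) #reset the plotting layout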

Correlation

From the correlation results, we can tentatively say the following:

  • Migrants has a moderate positive correlation with both popd66 and popd71 (0.37 and 0.39), which might suggest that higher populations at the destination correspond with more migrants.

  • Distance shows a moderate negative correlation with migrants (-0.31), which might indicate that greater distances are associated with fewer migrants.

  • There are very high positive correlations, close to 1, between pops66 and pops71 and between popd66 and popd71, which could indicate a very strong relationship between the populations in consecutive years, both at the source and at the destination.

  • The correlations between the populations at the source and at the destination are negative but weak, suggesting a minimal direct linear relationship between these variables.

mg1correlation<- cor(mg1[,1:6])
print(mg1correlation)
##            migrants   distance     pops66     pops71     popd66     popd71
## migrants  1.0000000 -0.3065114  0.3369534  0.3362471  0.3769860  0.3926451
## distance -0.3065114  1.0000000 -0.1251918 -0.1146829 -0.1251918 -0.1146829
## pops66    0.3369534 -0.1251918  1.0000000  0.9988841 -0.1111111 -0.1109871
## pops71    0.3362471 -0.1146829  0.9988841  1.0000000 -0.1109871 -0.1111111
## popd66    0.3769860 -0.1251918 -0.1111111 -0.1109871  1.0000000  0.9988841
## popd71    0.3926451 -0.1146829 -0.1109871 -0.1111111  0.9988841  1.0000000
plot(mg1correlation, type="b", col="black")
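
The correlation matrix is easier to read as a heatmap than as a line plot; a minimal sketch, assuming the corrplot package is available (it is not used elsewhere in this report):

library(corrplot)
#heatmap of the correlation matrix with the coefficients overlaid
corrplot(mg1correlation, method = "color", addCoef.col = "black")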

Scaling and Normalizing the data

The attributes are first scaled (each divided by its standard deviation) and then rescaled to the [0, 1] range using caret's preProcess() function.

suppressPackageStartupMessages(library(caret))
summary(mg1[,1:6])
##     migrants          distance        pops66            pops71       
##  Min.   :   95.0   Min.   : 164   Min.   : 108535   Min.   : 111641  
##  1st Qu.:  912.5   1st Qu.: 924   1st Qu.: 616788   1st Qu.: 534557  
##  Median : 3087.5   Median :1763   Median : 959205   Median : 957244  
##  Mean   : 9227.3   Mean   :1945   Mean   :1997176   Mean   :2141512  
##  3rd Qu.:10291.2   3rd Qu.:2940   3rd Qu.:1873674   3rd Qu.:2184621  
##  Max.   :99430.0   Max.   :4838   Max.   :6960870   Max.   :7703106  
##      popd66            popd71       
##  Min.   : 108535   Min.   : 111641  
##  1st Qu.: 616788   1st Qu.: 534557  
##  Median : 959205   Median : 957244  
##  Mean   :1997176   Mean   :2141512  
##  3rd Qu.:1873674   3rd Qu.:2184621  
##  Max.   :6960870   Max.   :7703106
preprocessParams<-preProcess(mg1[,1:6], method=c("scale"))
print(preprocessParams)
## Created from 90 samples and 6 variables
## 
## Pre-processing:
##   - ignored (0)
##   - scaled (6)
transformed_mg1<-predict(preprocessParams, mg1[,1:6])
summary(transformed_mg1)
##     migrants           distance          pops66            pops71       
##  Min.   :0.006258   Min.   :0.1345   Min.   :0.04794   Min.   :0.04522  
##  1st Qu.:0.060109   1st Qu.:0.7581   1st Qu.:0.27244   1st Qu.:0.21654  
##  Median :0.203384   Median :1.4464   Median :0.42369   Median :0.38777  
##  Mean   :0.607835   Mean   :1.5958   Mean   :0.88218   Mean   :0.86751  
##  3rd Qu.:0.677919   3rd Qu.:2.4120   3rd Qu.:0.82762   3rd Qu.:0.88497  
##  Max.   :6.549788   Max.   :3.9692   Max.   :3.07470   Max.   :3.12047  
##      popd66            popd71       
##  Min.   :0.04794   Min.   :0.04522  
##  1st Qu.:0.27244   1st Qu.:0.21654  
##  Median :0.42369   Median :0.38777  
##  Mean   :0.88218   Mean   :0.86751  
##  3rd Qu.:0.82762   3rd Qu.:0.88497  
##  Max.   :3.07470   Max.   :3.12047
summary(mg1)
##     migrants          distance        pops66            pops71       
##  Min.   :   95.0   Min.   : 164   Min.   : 108535   Min.   : 111641  
##  1st Qu.:  912.5   1st Qu.: 924   1st Qu.: 616788   1st Qu.: 534557  
##  Median : 3087.5   Median :1763   Median : 959205   Median : 957244  
##  Mean   : 9227.3   Mean   :1945   Mean   :1997176   Mean   :2141512  
##  3rd Qu.:10291.2   3rd Qu.:2940   3rd Qu.:1873674   3rd Qu.:2184621  
##  Max.   :99430.0   Max.   :4838   Max.   :6960870   Max.   :7703106  
##      popd66            popd71       
##  Min.   : 108535   Min.   : 111641  
##  1st Qu.: 616788   1st Qu.: 534557  
##  Median : 959205   Median : 957244  
##  Mean   :1997176   Mean   :2141512  
##  3rd Qu.:1873674   3rd Qu.:2184621  
##  Max.   :6960870   Max.   :7703106
preprocessParams<-preProcess(transformed_mg1[,1:6], method=c("range"))
print(preprocessParams)
## Created from 90 samples and 6 variables
## 
## Pre-processing:
##   - ignored (0)
##   - re-scaling to [0, 1] (6)
transformed_mg2<-predict(preprocessParams, transformed_mg1[,1:6])
summary(transformed_mg2)
##     migrants          distance          pops66            pops71       
##  Min.   :0.00000   Min.   :0.0000   Min.   :0.00000   Min.   :0.00000  
##  1st Qu.:0.00823   1st Qu.:0.1626   1st Qu.:0.07417   1st Qu.:0.05571  
##  Median :0.03013   Median :0.3421   Median :0.12414   Median :0.11139  
##  Mean   :0.09193   Mean   :0.3811   Mean   :0.27562   Mean   :0.26739  
##  3rd Qu.:0.10265   3rd Qu.:0.5939   3rd Qu.:0.25760   3rd Qu.:0.27307  
##  Max.   :1.00000   Max.   :1.0000   Max.   :1.00000   Max.   :1.00000  
##      popd66            popd71       
##  Min.   :0.00000   Min.   :0.00000  
##  1st Qu.:0.07417   1st Qu.:0.05571  
##  Median :0.12414   Median :0.11139  
##  Mean   :0.27562   Mean   :0.26739  
##  3rd Qu.:0.25760   3rd Qu.:0.27307  
##  Max.   :1.00000   Max.   :1.00000

Hierarchical Clustering

We try cluster counts from 2 to 6 with three linkage methods (complete, single, and average) and use the average silhouette width to determine the optimal number of clusters.

library(cluster)
set.seed(1234)
dist_mat<-dist(transformed_mg2, method = 'euclidean')
methds<-c('complete','single','average')
#average silhouette width for k = 2 to 6 under each linkage method
avgS<-matrix(NA, ncol=3, nrow=5, dimnames=list(2:6, methds))
for(k in 2:6){
  for (m in seq_along(methds)){
    h<-hclust(dist_mat, method=methds[m]) #hierarchical clustering with the current linkage
    c<-cutree(h, k)                       #cut the tree into k clusters
    s<-silhouette(c, dist_mat)            #silhouette widths for this assignment
    avgS[k-1, m]<-mean(s[,3])             #store the average silhouette width
  }
}
avgS
##    complete    single   average
## 2 0.4974456 0.4490366 0.4490366
## 3 0.6117387 0.4811353 0.4826988
## 4 0.6315946 0.6315946 0.6315946
## 5 0.5546771 0.6161266 0.6161266
## 6 0.5025381 0.5670597 0.5670597
plot(s) #silhouette plot of the last fit in the loop (k = 6, average linkage)

Based on the silhouette results, the average width peaks at k = 4 (0.63), with k = 3 under complete linkage close behind (0.61); values closer to 1 mean observations are well matched to their cluster assignments. We proceed with k = 3, which also agrees with the optimal number of clusters found by pamk() in the k-means section below.

dist_mat2<-dist(transformed_mg2, method='euclidean')
h_clust<-hclust(dist_mat2, method = 'average')
#visualize the dendrogram
plot(h_clust)
h_cut<-cutree(h_clust, k = 3)
#visualize the clusters
plot(h_clust)
rect.hclust(h_clust, k = 3, border = 2:6)
abline(h = 4, col = 'blue')

Visualizing all branches

suppressPackageStartupMessages(library(dendextend))
avg_dend_obj <- as.dendrogram(h_clust)
#color the branches according to the three clusters identified above
avg_col_dend <- color_branches(avg_dend_obj, k = 3)
plot(avg_col_dend)

Evaluating the Clustering Results

This is done by appending the cluster assignments to the transformed data frame with dplyr's mutate() function and inspecting the cluster sizes.

First, we append the cluster assignments as a new column labelled cluster and store the resulting data frame in migrants_cl; we then count the number of observations in each cluster.

suppressPackageStartupMessages(library(dplyr))
migrants_cl<-mutate(transformed_mg2, cluster = h_cut)
migrants_cl
## # A tibble: 90 × 7
##    migrants distance pops66 pops71 popd66 popd71 cluster
##       <dbl>    <dbl>  <dbl>  <dbl>  <dbl>  <dbl>   <int>
##  1 0.00161     0.163 0      0      0.0562 0.0541       1
##  2 0.0230      0.169 0.0945 0.0892 0.0562 0.0541       1
##  3 0.0105      0.204 0.0742 0.0557 0.0562 0.0541       1
##  4 0.0206      0.316 0.828  0.779  0.0562 0.0541       2
##  5 0.0624      0.392 1      1      0.0562 0.0541       2
##  6 0.00121     0.641 0.125  0.115  0.0562 0.0541       1
##  7 0.000906    0.723 0.124  0.107  0.0562 0.0541       1
##  8 0.00332     0.833 0.198  0.200  0.0562 0.0541       1
##  9 0.00332     1     0.258  0.273  0.0562 0.0541       1
## 10 0.00247     0.163 0.0562 0.0541 0      0            1
## # ℹ 80 more rows
count(migrants_cl, cluster)
## # A tibble: 3 × 2
##   cluster     n
##     <int> <int>
## 1       1    72
## 2       2    16
## 3       3     2
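
To characterize the three hierarchical clusters, the per-cluster means of the normalized attributes can be summarized; a small sketch, assuming a dplyr version that provides across():

#mean of each normalized attribute within each hierarchical cluster
migrants_cl %>%
  group_by(cluster) %>%
  summarise(across(migrants:popd71, mean))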
Validating the results

Since the dataset doesn't have labels, we use the Dunn index for internal validation.

library(clValid)
dunn(dist_mat2, h_cut)
## [1] 0.4718926

Based on this result, the Dunn index is relatively low (0.47), which indicates clusters that are not very compact or well separated.
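
For context, the Dunn index can also be computed for other cuts of the same hierarchical tree, to check whether a different number of clusters separates better; a minimal sketch:

#Dunn index for hierarchical cuts at k = 2 to 6 (average linkage)
sapply(2:6, function(k) dunn(dist_mat2, cutree(h_clust, k)))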

K-means Clustering

As with hierarchical clustering, we try different values of k.

We use the pamk() function from the fpc package to search for the ideal number of clusters, selecting the best by "asw" (average silhouette width).

library(fpc)
sol<-pamk(transformed_mg2, krange=2:6, criterion="asw", usepam=TRUE)
sol
## $pamobject
## Medoids:
##      ID   migrants  distance    pops66    pops71     popd66     popd71
## [1,] 24 0.01741582 0.4679076 0.1247065 0.1154726 0.09449392 0.08922112
## [2,] 69 0.06795188 0.3421053 1.0000000 1.0000000 0.12357963 0.10730485
## [3,] 52 0.11788393 0.3421053 0.1235796 0.1073048 1.00000000 1.00000000
## Clustering vector:
##  [1] 1 1 1 2 2 1 1 1 1 1 1 1 2 2 1 1 1 1 1 1 1 2 2 1 1 1 1 1 1 1 2 2 1 1 1 1 3 3
## [39] 3 3 2 3 3 3 3 3 3 3 3 3 3 3 3 3 1 1 1 1 2 2 1 1 1 1 1 1 1 2 2 1 1 1 1 1 1 1
## [77] 2 2 1 1 1 1 1 1 1 2 2 1 1 1
## Objective function:
##     build      swap 
## 0.3221549 0.3221549 
## 
## Available components:
##  [1] "medoids"    "id.med"     "clustering" "objective"  "isolation" 
##  [6] "clusinfo"   "silinfo"    "diss"       "call"       "data"      
## 
## $nc
## [1] 3
## 
## $crit
## [1] 0.0000000 0.4974456 0.6171584 0.5270395 0.4226829 0.4468736

The output gives nc = 3, making 3 the optimal number of clusters.

set.seed(1234)
mg3<-kmeans(transformed_mg2, centers=3, iter.max=200)
mg3
## K-means clustering with 3 clusters of sizes 56, 17, 17
## 
## Cluster means:
##     migrants  distance    pops66    pops71    popd66    popd71
## 1 0.05797841 0.4287090 0.1160510 0.1118216 0.1160510 0.1118216
## 2 0.15828199 0.3025875 0.1579182 0.1510858 0.9189611 0.8961470
## 3 0.13744341 0.3025875 0.9189611 0.8961470 0.1579182 0.1510858
## 
## Clustering vector:
##  [1] 1 1 1 3 3 1 1 1 1 1 1 1 3 3 1 1 1 1 1 1 1 3 3 1 1 1 1 1 1 1 3 3 1 1 1 1 2 2
## [39] 2 2 3 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1 1 1 3 3 1 1 1 1 1 1 1 3 3 1 1 1 1 1 1 1
## [77] 3 3 1 1 1 1 1 1 1 3 3 1 1 1
## 
## Within cluster sum of squares by cluster:
## [1] 6.791390 2.801417 2.271840
##  (between_SS / total_SS =  74.4 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
## [6] "betweenss"    "size"         "iter"         "ifault"

Notes on the output: the three clusters have sizes 56, 17, and 17, and the between-cluster sum of squares accounts for 74.4% of the total, indicating reasonably well-separated clusters.
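
To get a visual sense of how the k-means clusters separate, a base-R scatterplot matrix colored by cluster assignment can be used; a small sketch:

#scatterplot matrix of the normalized attributes, colored by k-means cluster
pairs(transformed_mg2, col = mg3$cluster, pch = 19)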

Cluster Validation

The silhouette coefficient is used to evaluate cluster compactness and cluster separation.

library(cluster)
migrants_s<-silhouette(mg3$cluster, dist(transformed_mg2))
plot(migrants_s)
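
The average silhouette width of the k-means solution can also be computed directly, for comparison with the hierarchical results above; a minimal sketch (column 3 of a silhouette object holds the individual widths):

#average silhouette width for the k-means clustering
mean(migrants_s[, 3])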