#install needed packages I have them preinstalled so are commented out
#install.packages('dendextend')
#install.packages('RColorBrewer')
#call needed libraries
suppressPackageStartupMessages(library(dendextend)) #hide messages from startup
suppressPackageStartupMessages(library(RColorBrewer))

Part 1 Fish Data Load the data and assign names

fish <- read.csv("lakefish.csv", header=T)
summary(fish)
##   Species       length          height          width      
##  Bream:35   Min.   :10.80   Min.   :14.50   Min.   : 8.70  
##  Pike :17   1st Qu.:23.85   1st Qu.:16.90   1st Qu.:11.00  
##  Roach:20   Median :35.05   Median :27.20   Median :13.70  
##  Smelt:14   Mean   :33.17   Mean   :28.18   Mean   :12.87  
##             3rd Qu.:40.60   3rd Qu.:39.17   3rd Qu.:14.68  
##             Max.   :68.00   Max.   :44.50   Max.   :16.10

Here we see the basic statistics of the fish in this data set. 1. Do a matrix of scatterplots for the 3 variables, and color the observations depending on their species.

#matrix of colors 
cols<- c( 'red', 'blue', 'dark green' , 'black')
#matrix for the symbols
symbol<-c(10, 20, 3, 4) 
pairs(fish[,2:4], pch= symbol[fish$Species], upper.panel = NULL, col = cols[fish$Species], main="Matrix plot of the fish species"  )
legend("topright", legend=c("Bream", "Pike", "Roach", "Smelt"),pch= symbol, col=cols, xpd=TRUE)

As we see there there are 4 main Groups in the top graph but only 3 distinct groups for the lower two. Interestingly the 2 groups are not the same depending on what parameters are investigated.

  1. State whether you advise standardizing the data prior to analysis. Cluster the 86 fish using hierarchical clustering and complete linkage (without using the first variable concerning the fish species). From the dendrogram, how many clusters would you suggest are present in this data set? Cut the dendrogram at the desired number of clusters.
fish_num <- fish[2:4]
Sdev <- apply(fish_num, 2, sd)
Sdfish<-sweep(fish_num, 2, Sdev, "/")



pairs(Sdfish, pch= symbol[fish$Species], upper.panel = NULL, col = cols[fish$Species],main="Fish Species Standardized by Standard Deviation")
legend("topright", legend=c("Bream", "Pike", "Roach", "Smelt"),pch= symbol, col=cols, cex=.75,xpd=TRUE)

While there are some changes in the grouping there is not enough to suggest that there will be much change in after standardising against SD

In this case the grouping are similar regardless whether raw or standard deviation so we can use any of the above sections to determine our dendrogram

lets use the Standardized data, with euclidean distances and complete groupings.

distancesd <- dist(Sdfish, method='euclidean')
completesd <- hclust(distancesd, method='complete')

dend<- as.dendrogram(completesd)
labels_colors(dend)<-cols[as.numeric(factor(fish$Species))]
plot(dend)

legend('topleft', legend = c('Bream', 'Pike', 'Roach', 'Smelt'), col=cols, pch=20, cex=0.75)

rect.hclust(completesd, h=3)

Here we see a nice dendrogram where each species has its own group( this can change depending on break point). suggesting 4 breaks pretty nicely ties in with the 4 know species to check how well this particulat clustering works

hclsd <- cutree(completesd, 4)
tabsd <- table(hclsd, fish$Species)
tabsd
##      
## hclsd Bream Pike Roach Smelt
##     1    35    0     0     0
##     2     0    0    20     1
##     3     0    0     0    13
##     4     0   17     0     0

From this tabel we see that the grouping almost totally captured the entire catagorization of the species with only 1 errant Smelt spoiling the grouping

plot(Sdfish, col = cols[hclsd],upper.panel = NULL, pch=symbol[fish$Species], main="Plot of groupings of the Species of fish")
legend("topright", legend=c(c("Group 1", "Group 2", "Group 3", "Group 4")),pch= 20,cex=0.75, col=cols, xpd=TRUE)
legend("right",legend=c(c("Bream", "Pike", "Roach", "Smelt")),pch= symbol,cex=0.75, col='black', xpd=TRUE)

Here we see that the groups are almost identical to the earlier graphs showing how good the grouping is.

3.Cluster the 86 fish using k-means clustering. How many clusters would you suggest are present in this data set? Detail any decisions you make when running this procedure. Interpret the clusters obtained by comparing them to the first variable of the dataset concerning the fish species

wgss<- rep(0,10)
n <- nrow(fish)
wgss[1] <- (n-1) * sum(apply(fish, 2, var))
## Warning in FUN(newX[, i], ...): NAs introduced by coercion
for(k in 2:10){
 wgss[k] <- sum(kmeans(Sdfish, centers = k)$withinss)
}
plot(1:10, wgss, type = "b", xlab = "k", ylab = "Within group sum of squares", main="Within Group sum of square for cluster analysis")

As can be seen here the optimal 4 ( huge drop in WGSS from 3 to 4 which then flattens out ), similar to the heirarchical clustering, lets take 4 as the trade off for completeness v compitational time

As before lets use the standardized data to test.

k <- 4
kclsd <- kmeans(Sdfish, center = k)
tabksd <- table(kclsd$cluster, fish$Species)
tabksd
##    
##     Bream Pike Roach Smelt
##   1     0   17     0     0
##   2     0    0    20     1
##   3    35    0     0     0
##   4     0    0     0    13

Again we see that the grouping ( while numbering is different) is the same as the previous clustering, still one errant Smelt.

plot(Sdfish, col = kclsd$cluster,upper.panel = NULL, pch=symbol[fish$Species])
legend("topright", legend=c(c("Group 1", "Group 2", "Group 3", "Group 4")),pch= 20,cex=0.75, col=cols, xpd=TRUE)
legend("right",legend=c(c("Bream", "Pike", "Roach", "Smelt")),pch= symbol,cex=0.75, col='black', xpd=TRUE)

Once again we see a strong classifiaction ( groups slightly differently numbered but very similar) with the errant Smelt again appearing but everything else apparently classified correctly.

Part 2 Crime Data 4.Perform a Principal Component Analysis on the numeric variables of the dataset and produce a visualization of the principal components against the original variables. Interpret the results.

crime <- read.csv("EurostatCrime2015.csv", header=T)
crime_numbers <- crime[2:7]

summary(crime_numbers)
##     Assault       Intentional.homicide      Rape           Robbery      
##  Min.   :  1.50   Min.   :0.000        Min.   : 0.800   Min.   :  8.03  
##  1st Qu.: 16.20   1st Qu.:0.810        1st Qu.: 2.658   1st Qu.: 22.34  
##  Median : 35.02   Median :0.985        Median : 5.335   Median : 39.48  
##  Mean   : 92.61   Mean   :1.428        Mean   : 9.986   Mean   : 52.66  
##  3rd Qu.: 99.66   3rd Qu.:1.512        3rd Qu.:12.200   3rd Qu.: 55.48  
##  Max.   :603.26   Max.   :5.750        Max.   :56.880   Max.   :196.68  
##  Sexual.assault        Theft       
##  Min.   :  1.400   Min.   : 108.4  
##  1st Qu.:  7.543   1st Qu.: 520.4  
##  Median : 14.610   Median :1003.9  
##  Mean   : 25.278   Mean   :1183.1  
##  3rd Qu.: 31.320   3rd Qu.:1649.8  
##  Max.   :120.790   Max.   :3828.0
#get a number of discrete colors so that the plot makes some form of sense( i dont see it )
n <- length(crime$Country)
qual_col_pals = brewer.pal.info[brewer.pal.info$category == 'qual',]
col_vector = unlist(mapply(brewer.pal, qual_col_pals$maxcolors, rownames(qual_col_pals)))
#plot the data
pairs(crime_numbers, upper.panel = NULL, col=col_vector, pch=3, main="Plot of crime data by country")
legend('topright', col=col_vector, legend=crime$Country, cex=0.3, pch=10)

Looking at the graph there are large differences in the scale of the numbers so lets try to standardiaze the data before we run our PCA.

sds <- apply(crime_numbers, 2, sd)
crime.std <- sweep(crime_numbers, 2, sds, "/")
pairs(crime.std, upper.panel = NULL, col= col_vector,pch=3, main="Plot of crime data by country")
legend('topright', col= col_vector, legend=crime$Country, cex=0.3, pch=10)

pca <- prcomp(crime.std)
summary(pca)
## Importance of components:
##                           PC1    PC2    PC3     PC4     PC5     PC6
## Standard deviation     1.6977 1.0871 1.0040 0.68082 0.63518 0.24674
## Proportion of Variance 0.4804 0.1970 0.1680 0.07725 0.06724 0.01015
## Cumulative Proportion  0.4804 0.6774 0.8454 0.92261 0.98985 1.00000

Lets plot this and see what it means

vars <- apply(pca$x, 2, var)  
props <- vars / sum(vars)
cumsum <- cumsum(props)
par(mar = c(5, 5, 3, 5))    

plot(props,type='b', col= "red", main="Scree plot of Principal Component Analysis", xlab="Principal component", ylab="Variance explained by Principal component")
par(new=T)
plot(cumsum, type='b', col='blue', xaxt='n', yaxt='n', xlab='', ylab='')
axis(4, ylim=c(0,max(cumsum)),col='black')
mtext('Cumulative variance for Principal components', side=4, line=3 )
legend("right", c('principal variances', 'cumulative variance'), col=c('red', 'blue'), pch=1, cex=0.6)

From this we can tell that see that components we need 4 components to explain 92% of the model, With component 1 explaining 48% and components 2 and 3 explaining less that 20% each Lets apply these components to the crime data we have.

new_crime<- predict(pca)
pairs(new_crime[,1:4], upper.panel = NULL, col = col_vector, main="Analysis of principal components by country")
legend('topright', col=col_vector, legend=crime$Country, cex=0.3, pch=20
       )

as can be seen by this the overall shape and trends of the data remains similar but the number of components explaining is far less.