#install needed packages I have them preinstalled so are commented out
#install.packages('dendextend')
#install.packages('RColorBrewer')
#call needed libraries
suppressPackageStartupMessages(library(dendextend)) #hide messages from startup
suppressPackageStartupMessages(library(RColorBrewer))
Part 1 Fish Data Load the data and assign names
fish <- read.csv("lakefish.csv", header=T)
summary(fish)
## Species length height width
## Bream:35 Min. :10.80 Min. :14.50 Min. : 8.70
## Pike :17 1st Qu.:23.85 1st Qu.:16.90 1st Qu.:11.00
## Roach:20 Median :35.05 Median :27.20 Median :13.70
## Smelt:14 Mean :33.17 Mean :28.18 Mean :12.87
## 3rd Qu.:40.60 3rd Qu.:39.17 3rd Qu.:14.68
## Max. :68.00 Max. :44.50 Max. :16.10
Here we see the basic statistics of the fish in this data set. 1. Do a matrix of scatterplots for the 3 variables, and color the observations depending on their species.
#matrix of colors
cols<- c( 'red', 'blue', 'dark green' , 'black')
#matrix for the symbols
symbol<-c(10, 20, 3, 4)
pairs(fish[,2:4], pch= symbol[fish$Species], upper.panel = NULL, col = cols[fish$Species], main="Matrix plot of the fish species" )
legend("topright", legend=c("Bream", "Pike", "Roach", "Smelt"),pch= symbol, col=cols, xpd=TRUE)
As we see there there are 4 main Groups in the top graph but only 3 distinct groups for the lower two. Interestingly the 2 groups are not the same depending on what parameters are investigated.
fish_num <- fish[2:4]
Sdev <- apply(fish_num, 2, sd)
Sdfish<-sweep(fish_num, 2, Sdev, "/")
pairs(Sdfish, pch= symbol[fish$Species], upper.panel = NULL, col = cols[fish$Species],main="Fish Species Standardized by Standard Deviation")
legend("topright", legend=c("Bream", "Pike", "Roach", "Smelt"),pch= symbol, col=cols, cex=.75,xpd=TRUE)
While there are some changes in the grouping there is not enough to suggest that there will be much change in after standardising against SD
In this case the grouping are similar regardless whether raw or standard deviation so we can use any of the above sections to determine our dendrogram
lets use the Standardized data, with euclidean distances and complete groupings.
distancesd <- dist(Sdfish, method='euclidean')
completesd <- hclust(distancesd, method='complete')
dend<- as.dendrogram(completesd)
labels_colors(dend)<-cols[as.numeric(factor(fish$Species))]
plot(dend)
legend('topleft', legend = c('Bream', 'Pike', 'Roach', 'Smelt'), col=cols, pch=20, cex=0.75)
rect.hclust(completesd, h=3)
Here we see a nice dendrogram where each species has its own group( this can change depending on break point). suggesting 4 breaks pretty nicely ties in with the 4 know species to check how well this particulat clustering works
hclsd <- cutree(completesd, 4)
tabsd <- table(hclsd, fish$Species)
tabsd
##
## hclsd Bream Pike Roach Smelt
## 1 35 0 0 0
## 2 0 0 20 1
## 3 0 0 0 13
## 4 0 17 0 0
From this tabel we see that the grouping almost totally captured the entire catagorization of the species with only 1 errant Smelt spoiling the grouping
plot(Sdfish, col = cols[hclsd],upper.panel = NULL, pch=symbol[fish$Species], main="Plot of groupings of the Species of fish")
legend("topright", legend=c(c("Group 1", "Group 2", "Group 3", "Group 4")),pch= 20,cex=0.75, col=cols, xpd=TRUE)
legend("right",legend=c(c("Bream", "Pike", "Roach", "Smelt")),pch= symbol,cex=0.75, col='black', xpd=TRUE)
Here we see that the groups are almost identical to the earlier graphs showing how good the grouping is.
3.Cluster the 86 fish using k-means clustering. How many clusters would you suggest are present in this data set? Detail any decisions you make when running this procedure. Interpret the clusters obtained by comparing them to the first variable of the dataset concerning the fish species
wgss<- rep(0,10)
n <- nrow(fish)
wgss[1] <- (n-1) * sum(apply(fish, 2, var))
## Warning in FUN(newX[, i], ...): NAs introduced by coercion
for(k in 2:10){
wgss[k] <- sum(kmeans(Sdfish, centers = k)$withinss)
}
plot(1:10, wgss, type = "b", xlab = "k", ylab = "Within group sum of squares", main="Within Group sum of square for cluster analysis")
As can be seen here the optimal 4 ( huge drop in WGSS from 3 to 4 which then flattens out ), similar to the heirarchical clustering, lets take 4 as the trade off for completeness v compitational time
As before lets use the standardized data to test.
k <- 4
kclsd <- kmeans(Sdfish, center = k)
tabksd <- table(kclsd$cluster, fish$Species)
tabksd
##
## Bream Pike Roach Smelt
## 1 0 17 0 0
## 2 0 0 20 1
## 3 35 0 0 0
## 4 0 0 0 13
Again we see that the grouping ( while numbering is different) is the same as the previous clustering, still one errant Smelt.
plot(Sdfish, col = kclsd$cluster,upper.panel = NULL, pch=symbol[fish$Species])
legend("topright", legend=c(c("Group 1", "Group 2", "Group 3", "Group 4")),pch= 20,cex=0.75, col=cols, xpd=TRUE)
legend("right",legend=c(c("Bream", "Pike", "Roach", "Smelt")),pch= symbol,cex=0.75, col='black', xpd=TRUE)
Once again we see a strong classifiaction ( groups slightly differently numbered but very similar) with the errant Smelt again appearing but everything else apparently classified correctly.
Part 2 Crime Data 4.Perform a Principal Component Analysis on the numeric variables of the dataset and produce a visualization of the principal components against the original variables. Interpret the results.
crime <- read.csv("EurostatCrime2015.csv", header=T)
crime_numbers <- crime[2:7]
summary(crime_numbers)
## Assault Intentional.homicide Rape Robbery
## Min. : 1.50 Min. :0.000 Min. : 0.800 Min. : 8.03
## 1st Qu.: 16.20 1st Qu.:0.810 1st Qu.: 2.658 1st Qu.: 22.34
## Median : 35.02 Median :0.985 Median : 5.335 Median : 39.48
## Mean : 92.61 Mean :1.428 Mean : 9.986 Mean : 52.66
## 3rd Qu.: 99.66 3rd Qu.:1.512 3rd Qu.:12.200 3rd Qu.: 55.48
## Max. :603.26 Max. :5.750 Max. :56.880 Max. :196.68
## Sexual.assault Theft
## Min. : 1.400 Min. : 108.4
## 1st Qu.: 7.543 1st Qu.: 520.4
## Median : 14.610 Median :1003.9
## Mean : 25.278 Mean :1183.1
## 3rd Qu.: 31.320 3rd Qu.:1649.8
## Max. :120.790 Max. :3828.0
#get a number of discrete colors so that the plot makes some form of sense( i dont see it )
n <- length(crime$Country)
qual_col_pals = brewer.pal.info[brewer.pal.info$category == 'qual',]
col_vector = unlist(mapply(brewer.pal, qual_col_pals$maxcolors, rownames(qual_col_pals)))
#plot the data
pairs(crime_numbers, upper.panel = NULL, col=col_vector, pch=3, main="Plot of crime data by country")
legend('topright', col=col_vector, legend=crime$Country, cex=0.3, pch=10)
Looking at the graph there are large differences in the scale of the numbers so lets try to standardiaze the data before we run our PCA.
sds <- apply(crime_numbers, 2, sd)
crime.std <- sweep(crime_numbers, 2, sds, "/")
pairs(crime.std, upper.panel = NULL, col= col_vector,pch=3, main="Plot of crime data by country")
legend('topright', col= col_vector, legend=crime$Country, cex=0.3, pch=10)
pca <- prcomp(crime.std)
summary(pca)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6
## Standard deviation 1.6977 1.0871 1.0040 0.68082 0.63518 0.24674
## Proportion of Variance 0.4804 0.1970 0.1680 0.07725 0.06724 0.01015
## Cumulative Proportion 0.4804 0.6774 0.8454 0.92261 0.98985 1.00000
Lets plot this and see what it means
vars <- apply(pca$x, 2, var)
props <- vars / sum(vars)
cumsum <- cumsum(props)
par(mar = c(5, 5, 3, 5))
plot(props,type='b', col= "red", main="Scree plot of Principal Component Analysis", xlab="Principal component", ylab="Variance explained by Principal component")
par(new=T)
plot(cumsum, type='b', col='blue', xaxt='n', yaxt='n', xlab='', ylab='')
axis(4, ylim=c(0,max(cumsum)),col='black')
mtext('Cumulative variance for Principal components', side=4, line=3 )
legend("right", c('principal variances', 'cumulative variance'), col=c('red', 'blue'), pch=1, cex=0.6)
From this we can tell that see that components we need 4 components to explain 92% of the model, With component 1 explaining 48% and components 2 and 3 explaining less that 20% each Lets apply these components to the crime data we have.
new_crime<- predict(pca)
pairs(new_crime[,1:4], upper.panel = NULL, col = col_vector, main="Analysis of principal components by country")
legend('topright', col=col_vector, legend=crime$Country, cex=0.3, pch=20
)
as can be seen by this the overall shape and trends of the data remains similar but the number of components explaining is far less.