Load the good plotting package
require(ggplot2)
## Loading required package: ggplot2
instar.measurements <- read.table("~/Downloads/instar measurements.csv", sep="," , h=TRUE) #read a document where the seperators are ",", and which has a header row (h=TRUE); and save this into the new variable instar.measurements
head(instar.measurements) #take a look at the first few rows with the head command
## Document sex instar eye femur pronotum femur.eye
## 1 P1220507.psd NA 198.73261 628.6017 200.56008 3.163053
## 2 P1220949.psd NA 155.36288 521.1288 176.89872 3.354269
## 3 P1220949.psd NA 152.89075 499.6399 177.44362 3.267954
## 4 DSCN9670.psd NA 79.71865 296.0612 95.74064 3.713825
## 5 DSCN9674.psd NA 89.80126 287.6240 88.67164 3.202895
## 6 P1230134.psd 1 105.16348 370.9281 112.41112 3.527157
## femur.pronotum pronotum.femur.x.10 pronotum.eye date duration
## 1 3.134232 3.190575 1.009196 06/13/2014 23:3
## 2 2.945916 3.394530 1.138616 06/21/2014 24:4
## 3 2.815767 3.551430 1.160591 06/21/2014 24:4
## 4 3.092325 3.233813 1.200982 6/23/2009 24:6
## 5 3.243697 3.082902 0.987421 6/23/2009 24:6
## 6 3.299746 3.030537 1.068918 06/25/2014 25:1
## day.of.year
## 1 164
## 2 172
## 3 172
## 4 174
## 5 174
## 6 176
start plotting right away!
ggplot(instar.measurements,aes(femur.eye,femur.pronotum)) + # set up the plot with femur.eye on x axis, and femur.pronotum on the y. The plus indicates there's more to come and R shouldn't evaluate yet.
geom_point(alpha=I(0.3)) + #plot as points with 50% transparency
theme_bw() #a nice clean chart style
Looks good, but we can add some things:
date
ggplot(instar.measurements,aes(femur.eye,femur.pronotum)) +
geom_point(aes(size=day.of.year),alpha=I(0.3)) + #add size by day of year
theme_bw()
#plotting date make it clear that the relationship (at least of femur/eye), is going in the directions we'd expect.
instar
ggplot(instar.measurements,aes(femur.eye,femur.pronotum)) +
geom_point(aes(size=day.of.year),alpha=I(0.3)) +
geom_text(aes(label=instar)) + #add text labels for instar
theme_bw()
sex
ggplot(instar.measurements,aes(femur.eye,femur.pronotum)) +
geom_point(aes(size=day.of.year),alpha=I(0.3)) +
geom_text(aes(label=instar)) +
geom_point(data=subset(instar.measurements, sex!=""),aes(colour=sex,size=day.of.year),pch=1) + #ignoring the points for which sex is blank, draw rings (pch=1), appropriately size, whose color indicates sex.
theme_bw()
## Warning: Removed 27 rows containing missing values (geom_text).
Alternative axes
ggplot(instar.measurements,aes(femur.eye,pronotum.eye)) +
geom_point(aes(size=day.of.year),alpha=I(0.3)) +
geom_text(aes(label=instar)) +
geom_point(data=subset(instar.measurements, sex!=""),aes(colour=sex,size=day.of.year),pch=1) +
theme_bw()
Alternative axes
ggplot(instar.measurements,aes(femur.pronotum,pronotum.eye)) +
geom_point(aes(size=day.of.year),alpha=I(0.3)) +
geom_text(aes(label=instar)) +
geom_point(data=subset(instar.measurements, sex!=""),aes(colour=sex,size=day.of.year),pch=1) +
theme_bw()
Just for fun, lets see what K-means clustering does with it. First with the best axes, which is maybe femur.eye? We’ll drop sex for now and set coour to equal the categorization from Kmeans
clusters<-kmeans(instar.measurements$femur.eye, #what data
6, #how many groups
iter.max=50, #default algorithm settings
nstart=50
)
ggplot(instar.measurements, aes(femur.eye,0)) +
geom_point(aes(colour=as.factor(clusters$cluster),size=day.of.year),alpha=I(0.3)) +
geom_text(aes(label=instar))
Here’s Kmeans clustering considering that second data axis as well (Petal.Width).
clusters<-kmeans(instar.measurements[,c(7,10)], #consider columns 7 & 10
6,
iter.max=50,
nstart=50
)
ggplot(instar.measurements, aes(femur.eye,pronotum.eye)) +
geom_point(aes(colour=as.factor(clusters$cluster),size=day.of.year),alpha=I(0.3)) +
geom_text(aes(label=instar))
Addiing in all the axes (3 morphological ratios and day of year), scaling first to make sure they’re comparable units.
data <- instar.measurements[,c(7,8,10,13)]
data <- scale(data)
clusters<-kmeans(data,
6,
iter.max=50,
nstart=50
)
ggplot(instar.measurements, aes(femur.eye,pronotum.eye)) +
geom_point(aes(colour=as.factor(clusters$cluster),size=day.of.year),alpha=I(0.3)) +
geom_text(aes(label=instar))
To me, this seems to do extremely well – by adding in day. of year, we’re now identifing all of the first-instar individuals in one group (blue), and the adults are all put in two groups (green and olive, which by the way look to be at least loosely associated with sex)
This becomes more clear by plotting day of year on the x axis:
ggplot(instar.measurements, aes(day.of.year,pronotum.eye)) +
geom_point(aes(colour=as.factor(clusters$cluster),size=4),alpha=I(0.3)) +
geom_text(aes(label=instar))
So here’s the final thing that maybe gets at your question (identify the three middle instars). This is all the data columns, with adult rows removed for now to make the clustering algorithm’s job easier. We’ll run the clusters once, and visualize them by applying them to the data plotted along several different pairs of axes.
data <- subset(instar.measurements,instar<5|is.na(instar))
ddata <- data[,c(7,8,10,13)]
cdata <- scale(ddata)
clusters<-kmeans(cdata,
4,
iter.max=50,
nstart=50
)
ggplot(data, aes(femur.eye,pronotum.eye)) +
geom_point(aes(colour=as.factor(clusters$cluster),size=day.of.year),alpha=I(0.3)) +
geom_text(aes(label=instar))
ggplot(data, aes(pronotum.eye,femur.pronotum)) +
geom_point(aes(colour=as.factor(clusters$cluster),size=day.of.year),alpha=I(0.3)) +
geom_text(aes(label=instar))
ggplot(data,aes(day.of.year,femur.eye)) +
geom_point(aes(colour=as.factor(clusters$cluster),size=4),alpha=I(0.3)) +
geom_text(aes(label=data$instar))
Finally, with ordiation we can reduced our dimesions down to two, and so get a (slightly squished) 2D view of the whole thing, which is presumably more similar to 4-dimensional shape in which Kmeans is working:
require(vegan) #laod ordination package
## Loading required package: vegan
## Loading required package: permute
## Loading required package: lattice
## This is vegan 2.3-2
hopMDS <- metaMDS(cdata+2) #run a non-metric multidimensional scaling algorithm on our data
## Run 0 stress 0.08107029
## Run 1 stress 0.08133139
## ... procrustes: rmse 0.03828469 max resid 0.1331709
## Run 2 stress 0.08133137
## ... procrustes: rmse 0.03826607 max resid 0.1331509
## Run 3 stress 0.1033685
## Run 4 stress 0.09217701
## Run 5 stress 0.1070942
## Run 6 stress 0.3969992
## Run 7 stress 0.1116534
## Run 8 stress 0.1127498
## Run 9 stress 0.09545618
## Run 10 stress 0.1027006
## Run 11 stress 0.0961535
## Run 12 stress 0.08137107
## ... procrustes: rmse 0.02258616 max resid 0.07050449
## Run 13 stress 0.08266918
## Run 14 stress 0.08087881
## ... New best solution
## ... procrustes: rmse 0.01182668 max resid 0.04452813
## Run 15 stress 0.1144524
## Run 16 stress 0.09285081
## Run 17 stress 0.09217758
## Run 18 stress 0.09532568
## Run 19 stress 0.08054607
## ... New best solution
## ... procrustes: rmse 0.03543392 max resid 0.1286653
## Run 20 stress 0.0813314
ggplot(data.frame(hopMDS$points), aes(MDS1,MDS2)) +
geom_point(aes(colour=as.factor(clusters$cluster),size=4),alpha=I(0.3)) +
geom_text(aes(label=data$instar))
We can plot the same ordination with index numbers:
plot(hopMDS,type="t",display="sites")
ordispider(hopMDS,clusters$cluster)
And here are the file names for these:
data.frame(data$Document,clusters$cluster)
## data.Document clusters.cluster
## 1 P1220507.psd 2
## 2 P1220949.psd 2
## 3 P1220949.psd 2
## 4 DSCN9670.psd 2
## 5 DSCN9674.psd 2
## 6 P1230134.psd 2
## 7 P1230162.psd 3
## 8 P1230225.psd 3
## 9 P1230234.psd 2
## 10 P1230284.psd 3
## 11 P1230322.psd 3
## 12 P1230379.psd 2
## 13 P1230381.psd 2
## 14 P1230415.psd 3
## 15 P1230150.psd 3
## 16 P1230808.psd 2
## 17 P1230810.psd 2
## 18 DSCN0479.psd 2
## 19 DSCN1153.psd 2
## 20 DSCN1324.psd 1
## 21 DSCN1337.psd 1
## 22 DSCN1347.psd 1
## 23 DSCN1431.psd 2
## 24 DSCN1441.psd 1
## 25 DSCN1459.psd 2
## 26 P1250452.psd 1
## 27 P1250509.psd 1
## 28 P1250509.psd 1
## 29 DSCN1858.psd 1
## 30 DSCN9740.psd 3
## 31 DSCN9824.psd 2
## 32 P1040583.psd 1
## 33 P1040648.psd 1
## 34 DSCN5600.psd 4
## 35 DSCN5602.psd 4
## 40 P1160143.psd 4
## 46 DSCN7458.psd 4