# Setup ----
# The workspace is deliberately NOT cleared with rm(list = ls()): that call
# silently deletes whatever objects the caller already has, while leaving
# attached packages and options in place, so it does not yield a clean
# session anyway. Run the script in a fresh R session instead.

# Install pacman on first use. requireNamespace() only checks availability;
# require() would additionally attach the package as a side effect.
if (!requireNamespace("pacman", quietly = TRUE)) {
  install.packages("pacman")
}
pacman::p_load("data.table", "caTools", "ggplot2")
library(MASS)

# Class labels of the built-in iris data.
levels(iris$Species)
## [1] "setosa" "versicolor" "virginica"

# Scatter-plot matrix of the four measurements, one fill colour per species.
# The factor's integer codes index into the colour vector.
pairs(
  iris[1:4],
  main = "Iris Data -- 3 species",
  pch = 21,
  bg = c("red", "green3", "blue")[as.integer(iris$Species)],
  lower.panel = NULL,
  labels = c("SL", "SW", "PL", "PW"),
  font.labels = 2,
  cex.labels = 4.5
)
# Load the provided training and test data.
# NOTE(review): both paths are absolute and machine-specific; adjust them (or
# pass them in) before running elsewhere.
training_set <- read.csv("D:\\Google Drive\\Term3-2-DMG2\\Assignment 1\\datasets\\iris\\train.csv")
validation_set <- read.csv("D:\\Google Drive\\Term3-2-DMG2\\Assignment 1\\datasets\\iris/test.csv")

# Standardise every column except the 5th (assumed to be the Species label,
# as in the original script).
# The validation set is scaled with the TRAINING means and standard
# deviations: scaling it with its own statistics would place it in a different
# coordinate system from the one the discriminants are fitted on.
scaled_training <- scale(training_set[-5])
training_set[-5] <- scaled_training
validation_set[-5] <- scale(
  validation_set[-5],
  center = attr(scaled_training, "scaled:center"),
  scale = attr(scaled_training, "scaled:scale")
)
# From the scatter plot above, it is evident that the Versicolor and Virginica species are more similar to each other than either is to Setosa, so they are first merged into a single meta-class.
# Step 1: setosa vs. the merged virginica/versicolor meta-class ----
# Every row starts in the meta-class; setosa rows are then relabelled.
training_set$SpeciesMetaClass <- "virginica_versicolor"
training_set$SpeciesMetaClass[training_set$Species == "setosa"] <- "setosa"

# Two-class linear (Fisher) discriminant on the four measurements.
lda1 <- lda(
  formula = SpeciesMetaClass ~
    Sepal.Length + Sepal.Width + Petal.Length + Petal.Width,
  data = training_set
)

# Project the training data onto the discriminant direction. Feature columns
# are picked by the row names of the coefficient matrix rather than by
# position, so each column is always multiplied by its own coefficient
# regardless of the column order in the CSV.
projecteddata1 <- as.matrix(training_set[, rownames(lda1$scaling)]) %*%
  lda1$scaling
projecteddata1df <- data.frame(projecteddata1, training_set$SpeciesMetaClass)

# Discriminant score per observation; seq_along() is safe for empty input,
# unlike 1:length().
ggplot(
  projecteddata1df,
  aes(
    x = seq_along(LD1),
    y = LD1,
    color = training_set.SpeciesMetaClass,
    shape = training_set.SpeciesMetaClass
  )
) +
  geom_point() +
  labs(
    title = "FDA : 2 Classes Setosa and Metaclass Projection",
    x = "Observations",
    y = "Fisher Discriminant"
  )
# Step 2: separate the two similar species within the meta-class ----
training_set_subset <- subset(
  training_set,
  SpeciesMetaClass == "virginica_versicolor"
)

# Rebuild the response as a factor over the species actually present. If
# Species was read as a factor it still carries the now-unused "setosa"
# level, which makes lda() warn "group setosa is empty".
training_set_subset$Species <- factor(training_set_subset$Species)

lda2 <- lda(
  formula = Species ~ Sepal.Length + Sepal.Width + Petal.Length + Petal.Width,
  data = training_set_subset
)

# Project the subset onto the second discriminant direction. Columns are
# selected by the coefficient matrix's row names so features and coefficients
# cannot be misaligned. data.frame() turns the projection matrix into a data
# frame for plotting.
projecteddata2 <- as.matrix(training_set_subset[, rownames(lda2$scaling)]) %*%
  lda2$scaling
projecteddata2df <- data.frame(projecteddata2, training_set_subset$Species)

ggplot(
  projecteddata2df,
  aes(
    x = seq_along(LD1),
    y = LD1,
    color = training_set_subset.Species,
    shape = training_set_subset.Species
  )
) +
  geom_point() +
  labs(
    title = "FDA : 2 Classes Virginica and Versicolor Projection",
    x = "Observations",
    y = "Fisher Discriminant"
  )
# Step 3: project the validation data onto both discriminant directions ----
# fp1 uses the setosa-vs-meta-class model, fp2 the virginica-vs-versicolor one.
fp1 <- predict(lda1, newdata = validation_set)
fp2 <- predict(lda2, newdata = validation_set)

# data.frame() de-duplicates the two "LD1" columns to LD1 and LD1.1.
fdadf <- data.frame(fp1$x, fp2$x, validation_set$Species)

# First projection on the x axis, second on the y axis, one colour and shape
# per true species.
ggplot(
  data = fdadf,
  mapping = aes(
    x = LD1,
    y = LD1.1,
    colour = validation_set.Species,
    shape = validation_set.Species
  )
) +
  geom_point() +
  ggtitle("FDA 3 Classes Setosa, Virginica and Versicolor Projection") +
  xlab("FP1") +
  ylab(" FP2")
# Observation:
# A simple technique such as the Fisher discriminant helped us both to classify the species and to reduce the dimensionality. The projections give a clearer picture of how the classes separate than feature-versus-feature scatter plots alone.