# Load the iris CSV and run a first univariate exploration of its columns.
# NOTE(review): the working directory and the CSV path are absolute,
# machine-specific paths; they are kept exactly as the script had them.
setwd("C:/Users/pullannagari.manasa/Desktop")
irisData <- read.csv("C:/Users/pullannagari.manasa/Desktop/irisData.csv")

# Structure of the data (types of variables)
str(irisData)

# Univariate analysis
# Frequency of each value in the class column.
table(irisData$class)

# Distribution of each measurement column.
# NOTE(review): only the sepal-length column name carries a trailing dot here;
# presumably that matches the header as read by read.csv() -- confirm with
# names(irisData).
hist(irisData$sepal.length.in.cm.)
hist(irisData$sepal.width.in.cm)
hist(irisData$petal.length.in.cm)
hist(irisData$petal.width.in.cm)

# Petal width plotted against row index.
plot(irisData$petal.width.in.cm)

# Install pastecs only when it is missing, so that re-running the script does
# not reinstall the package every time.
if (!requireNamespace("pastecs", quietly = TRUE)) {
  install.packages("pastecs")
}
library(pastecs)

# Descriptive statistics for the whole data frame ...
stat.desc(irisData)

# ... and for the four measurement columns only.
stat.desc(
  irisData[, c(
    "sepal.length.in.cm.",
    "sepal.width.in.cm",
    "petal.length.in.cm",
    "petal.width.in.cm"
  )]
)

# Pearson's Chi-squared test: checks whether two variables are related (H0, the null hypothesis, is "no relation"; to reject it we need p < 0.05).

##chisq.test(irisData$sepal.length.in.cm.,irisData$class)

#### OUTLIER DETECTION AND IMPUTATION ####

## For values that lie outside the 1.5 * IQR limits, we can cap them: observations
## below the lower limit are replaced with the value of the 5th percentile, and
## those above the upper limit are replaced with the value of the 95th percentile.

# Cap outliers in the sepal-width column: values beyond the 1.5 * IQR fences
# are replaced by the 5th / 95th percentile of the column.

# Box plots of the numeric columns only; a non-numeric column (such as a
# character class label) cannot be drawn as a box plot.
boxplot(irisData[vapply(irisData, is.numeric, logical(1))])

x <- irisData$sepal.width.in.cm

# Quartiles define the fences; the 5th/95th percentiles are the replacement
# values.
qnt <- quantile(x, probs = c(0.25, 0.75), na.rm = TRUE)
caps <- quantile(x, probs = c(0.05, 0.95), na.rm = TRUE)
H <- 1.5 * IQR(x, na.rm = TRUE)

# which() drops NA comparisons, so missing entries are left untouched.
x[which(x < (qnt[1] - H))] <- caps[1]
x[which(x > (qnt[2] + H))] <- caps[2]

# Distribution after capping.
hist(x)

# Write the capped values back into the data frame.
irisData$sepal.width.in.cm <- x

### Missing Value Imputation ###

# missForest supplies prodNA(); mice is used further down for the imputation.
library("missForest")
library("mice")

# Generate 10% missing values at random in every column except column 5.
# NOTE(review): no seed is set in this script before this call, so the
# positions of the NAs presumably differ from run to run.
iris.mis <- prodNA(irisData[, -5], noNA = 0.1)

# Check the missing values introduced in the data
summary(iris.mis)

stat.desc(iris.mis)

# Encode the categorical class variable with dummy.code() from psych and look
# at the pairwise correlations between the dummies and the measurements.
library("psych")

# iris.mis was built from irisData[, -5] and therefore has no class column;
# the labels have to come from irisData itself.
new <- dummy.code(irisData$class)

new.sat <- data.frame(new, iris.mis)

# cor() needs numeric input: keep the numeric columns only (this also drops a
# class column should iris.mis ever carry one).
new.sat <- new.sat[vapply(new.sat, is.numeric, logical(1))]

round(cor(new.sat, use = "pairwise"), 2)

# Load mice (install it first if it is not available).
library(mice)

# md.pattern() returns a tabular form of the missing values present in each
# variable of a data set.
md.pattern(iris.mis)

# Plot the missing values.
library(VIM)
mice_plot <- aggr(
  iris.mis,
  col = c("navyblue", "yellow"),
  numbers = TRUE,
  sortVars = TRUE,
  labels = names(iris.mis),
  cex.axis = 0.7,
  gap = 3,
  ylab = c("Missing data", "Pattern")
)

# Impute the missing values: 5 imputed data sets, 50 iterations, method "pmm",
# fixed seed.
imputed_Data <- mice(iris.mis, m = 5, maxit = 50, method = "pmm", seed = 500)

summary(imputed_Data)

# Get the complete data (2nd out of the 5 imputed data sets).
# Namespace-qualified so that a same-named function from another attached
# package cannot mask mice's complete().
completeData <- mice::complete(imputed_Data, 2)

# Re-attach column 5 of the original data, which was left out before the
# missing values were generated.
completeData <- cbind(completeData, irisData[5])

colnames(completeData) <- c(
  "sepal_length",
  "sepal_width",
  "petal_length",
  "petal_width",
  "class"
)

#### DIVIDING DATA INTO TESTING AND TRAINING

# Random 70% / 30% split of completeData into training and test rows.
# NOTE(review): no seed is set before sample(), so the split presumably changes
# from run to run.
n_rows <- nrow(completeData)

# seq_len() stays correct for an empty data frame (1:0 would yield c(1, 0));
# floor() makes the sample size an explicit whole number.
index <- sample(seq_len(n_rows), size = floor(n_rows * 0.7))

train <- completeData[index, ]

# setdiff() instead of -index: a negative index built from an empty vector
# would select no rows at all rather than every row.
test <- completeData[setdiff(seq_len(n_rows), index), ]

# The data viewer is only opened in an interactive session.
if (interactive()) {
  View(train)
  View(test)
}

# Scatter-plot matrix of the training data.
pairs(train)

#### decision tree based gradient boosting model ###############

# Gradient boosting (gbm) classifier on the training data, followed by
# prediction and evaluation on the test data.
library(gbm) # gbm() is called below; the package must be attached first
library(caret)

# Class labels used to name the probability columns.
# NOTE(review): assumes the model returns the class probabilities in this
# order -- confirm against the fitted model.
class_levels <- c("Iris-setosa", "Iris-versicolor", "Iris-virginica")

model <- gbm(
  class ~ petal_length + sepal_length + sepal_width + petal_width,
  data = train,
  n.trees = 1000,
  interaction.depth = 2,
  distribution = "multinomial"
)

summary(model) ## important variables (petal_length, petal_width)

# Prediction
# The predict() generic dispatches to the gbm method; column 5 of test (the
# class label) is left out of the predictors.
prediction <- predict(model, test[-5], type = "response", n.trees = 1000)

prediction_round <- data.frame(round(prediction, 3))
colnames(prediction_round) <- class_levels

# Work on the three probability columns only, so that the helper column `sc`
# can never take part in the arg-max.
class_prob <- as.matrix(prediction_round[class_levels])
prediction_round$sc <- apply(class_prob, 1, max)
prediction_round$category <-
  class_levels[max.col(class_prob, ties.method = "first")]

# Evaluation
# Both factors share the same levels, so the table is always square even when
# one class is never predicted.
actual <- factor(test[, 5], levels = class_levels)
predicted <- factor(prediction_round$category, levels = class_levels)

cm1 <- table(actual, predicted)

print(cm1)

# Misclassification rate.
1 - sum(diag(cm1)) / sum(cm1)

# Predictions are passed as `data` and the true labels as `reference`, so the
# per-class statistics are not computed on a transposed table.
confusionMatrix(data = predicted, reference = actual) ## accuracy 93% (as reported by the original author)

# -----------------------------------------------------------------------------
### multinomial logistic regression ####

# Multinomial logistic regression (nnet::multinom) on the training data,
# followed by prediction and evaluation on the test data.
library(nnet)

logistic <- multinom(class ~ ., data = train)

logistic

# Prediction
# Column 5 of test (the class label) is left out of the predictors.
prediction_new <- data.frame(
  predict(logistic, newdata = test[, -5], type = "class")
)

# Evaluation
# The true labels get the same levels as the predictions, so the table is
# always square.
predicted_class <- prediction_new[, 1]
actual_class <- factor(test[, 5], levels = levels(predicted_class))

cm2 <- table(actual_class, predicted_class)

print(cm2)

# Misclassification rate.
1 - sum(diag(cm2)) / sum(cm2)

# confusionMatrix() comes from caret, which is attached earlier in the script.
# Predictions are passed as `data` and the true labels as `reference`.
confusionMatrix(data = predicted_class, reference = actual_class) ## accuracy = 93% (as reported by the original author)

# structure of data (types of variables)

# univariate analysis

# Pearson's Chi-squared test: checks whether two variables are related (H0, the null hypothesis, is "no relation"; to reject it we need p < 0.05).

##chisq.test(irisData$sepal.length.in.cm.,irisData$class)

#### OUTLIER DETECTION AND IMPUTATION ####

## For values that lie outside the 1.5 * IQR limits, we can cap them: observations
## below the lower limit are replaced with the value of the 5th percentile, and
## those above the upper limit are replaced with the value of the 95th percentile.

# Generate 10% missing values at random

# Check the missing values introduced in the data

# encoding categorical variable with dummy.code() in psych

# NOTE(review): this fragment repeats the dummy-coding step that already
# appears earlier in the file; it is kept, with the same corrections.
library("psych")

# iris.mis was built from irisData[, -5] and therefore has no class column;
# the labels have to come from irisData itself.
new <- dummy.code(irisData$class)

new.sat <- data.frame(new, iris.mis)

# cor() needs numeric input: keep the numeric columns only.
new.sat <- new.sat[vapply(new.sat, is.numeric, logical(1))]

round(cor(new.sat, use = "pairwise"), 2)

# install mice

# md.pattern() returns a tabular form of the missing values present in each variable of a data set.

# plotting missing values

# imputing missing values

# get complete data (2nd out of 5)

# prediction

# evaluation

# -----------------------------------------------------------------------------
### multinomial logistic regression ####

# prediction

# evaluation