About Data

Reading the data as Data Frame

df <- read.csv2("C:/Users/anilk/OneDrive - SRH IT/Analytics-2/Input_data/lda_ex.csv")
head(df)
##   group security ecology innovation prestige
## 1     A        8       6          8       10
## 2     A        7       7          8        9
## 3     A        8       6          9        9
## 4     A        9       5          7        9
## 5     A       10       8          9       10
## 6     A        7       5          6        8
str(df)
## 'data.frame':    45 obs. of  5 variables:
##  $ group     : chr  "A" "A" "A" "A" ...
##  $ security  : int  8 7 8 9 10 7 9 8 7 8 ...
##  $ ecology   : int  6 7 6 5 8 5 7 7 6 7 ...
##  $ innovation: int  8 8 9 7 9 6 10 9 7 8 ...
##  $ prestige  : int  10 9 9 9 10 8 9 9 9 8 ...
df <- subset(df , group != "C")
df$group <- factor(df$group)
str(df)
## 'data.frame':    30 obs. of  5 variables:
##  $ group     : Factor w/ 2 levels "A","B": 1 1 1 1 1 1 1 1 1 1 ...
##  $ security  : int  8 7 8 9 10 7 9 8 7 8 ...
##  $ ecology   : int  6 7 6 5 8 5 7 7 6 7 ...
##  $ innovation: int  8 8 9 7 9 6 10 9 7 8 ...
##  $ prestige  : int  10 9 9 9 10 8 9 9 9 8 ...
#df <- droplevels(df)
#str(df)
summary(df)
##  group     security         ecology         innovation        prestige     
##  A:15   Min.   : 7.000   Min.   : 5.000   Min.   : 4.000   Min.   : 5.000  
##  B:15   1st Qu.: 8.000   1st Qu.: 6.250   1st Qu.: 5.000   1st Qu.: 6.000  
##         Median : 8.000   Median : 7.500   Median : 6.000   Median : 6.500  
##         Mean   : 8.433   Mean   : 7.367   Mean   : 6.633   Mean   : 7.133  
##         3rd Qu.: 9.000   3rd Qu.: 8.000   3rd Qu.: 8.000   3rd Qu.: 9.000  
##         Max.   :10.000   Max.   :10.000   Max.   :10.000   Max.   :10.000

Exploratory data Analysis :-

Numerical variables :-

plot_box <- function(attr){
  colname <- eval(parse(text=paste('df$' ,attr , sep = '' )))
  boxplot(colname , horizontal = TRUE)
  remove(colname)
}

plot_hist <- function(attr){
  colname <- eval(parse(text=paste('df$' ,attr , sep = '' )))
  sub_text <- paste('Mean :' , round (mean(colname) , 2) ,
                    '\nMedian:', round (median(colname) , 2) ,
                    '\nStd. Dev:', round (sd(colname) , 2)
                    )
  hist(colname , main = attr , labels = TRUE , xlab = '' , sub=sub_text )
}


par(mfrow=c(4,2))

plot_box('security')
plot_box('ecology')
plot_hist('security')
plot_hist('ecology')


plot_box('innovation')
plot_box('prestige')
plot_hist('innovation')
plot_hist('prestige')

remove(plot_box , plot_hist)

Pair plots :-

require(psych)
## Loading required package: psych
pairs.panels(df[2:5],
              gap = 0,
              bg = c('red' , 'green' , 'blue' , 'black')[df$group],
              pch=21)

Numerical variables in various categories :-

par(mfrow=c(2,2))
boxplot(df$security ~ df$group , xlab = 'GROUP' , ylab = 'SECURITY' , main = "SECURITY")
boxplot(df$ecology ~ df$group , xlab = 'GROUP' , ylab = 'ECOLOGY' , main = "ECOLOGY")
boxplot(df$innovation ~ df$group , xlab = 'GROUP' , ylab = 'INNOVAITON' , main = "INNOVAITON")
boxplot(df$prestige ~ df$group , xlab = 'GROUP' , ylab = 'PRESTIGE' , main = "PRESTIGE")

## Fitting LDA :-

require(MASS)
## Loading required package: MASS
fit <- lda(group ~ . , data=df)
fit
## Call:
## lda(group ~ ., data = df)
## 
## Prior probabilities of groups:
##   A   B 
## 0.5 0.5 
## 
## Group means:
##   security  ecology innovation prestige
## A 8.400000 6.533333   7.933333 8.666667
## B 8.466667 8.200000   5.333333 5.600000
## 
## Coefficients of linear discriminants:
##                    LD1
## security    0.02473145
## ecology     0.48227799
## innovation -0.30381323
## prestige   -0.90305578
plot(fit , type="both")

p <- predict(fit)
head(p)
## $class
##  [1] A A A A A A A A A A A B A A A B B B B B B B B B B B B B B B
## Levels: A B
## 
## $posterior
##               A            B
## 1  9.999999e-01 1.086477e-07
## 2  9.999588e-01 4.122222e-05
## 3  9.999985e-01 1.485701e-06
## 4  9.999971e-01 2.860349e-06
## 5  9.999976e-01 2.411406e-06
## 6  9.995531e-01 4.468691e-04
## 7  9.999964e-01 3.606657e-06
## 8  9.999878e-01 1.219334e-05
## 9  9.999811e-01 1.891677e-05
## 10 9.976405e-01 2.359483e-03
## 11 9.999907e-01 9.322778e-06
## 12 3.262928e-04 9.996737e-01
## 13 9.999931e-01 6.943712e-06
## 14 9.982416e-01 1.758425e-03
## 15 9.993724e-01 6.275789e-04
## 16 3.239822e-06 9.999968e-01
## 17 8.602414e-07 9.999991e-01
## 18 3.262928e-04 9.996737e-01
## 19 8.819030e-04 9.991181e-01
## 20 1.367531e-03 9.986325e-01
## 21 3.239822e-06 9.999968e-01
## 22 1.381097e-05 9.999862e-01
## 23 4.267193e-04 9.995733e-01
## 24 5.689226e-06 9.999943e-01
## 25 1.227771e-03 9.987722e-01
## 26 1.668258e-04 9.998332e-01
## 27 1.668258e-04 9.998332e-01
## 28 9.389979e-04 9.990610e-01
## 29 2.929150e-04 9.997071e-01
## 30 7.864905e-06 9.999921e-01
## 
## $x
##          LD1
## 1  -3.673802
## 2  -2.313199
## 3  -3.074559
## 4  -2.924479
## 5  -2.963596
## 6  -1.767073
## 7  -2.871363
## 8  -2.592281
## 9  -2.491664
## 10 -1.385412
## 11 -2.653781
## 12  1.839149
## 13 -2.721283
## 14 -1.452914
## 15 -1.689225
## 16  2.895938
## 17  3.199751
## 18  1.839149
## 19  1.611221
## 20  1.510604
## 21  2.895938
## 22  2.563740
## 23  1.777649
## 24  2.766936
## 25  1.535335
## 26  1.992882
## 27  1.992882
## 28  1.596836
## 29  1.863880
## 30  2.692742

Bi-Plot :-

#require(devtools)
#install_github("fawda123/ggord")
#require(ggord)
#ggord(fit , df$group)

Partition Plot :-

require(klaR)
## Loading required package: klaR
partimat(group~. , data = df , method = "lda")

df$Predicted_group <- p$class
df$prob_A <- p$posterior[,1]
df$prob_B <- p$posterior[,2]
df$descriment_values <- p$x[,1]
remove(p)
head(df)
##   group security ecology innovation prestige Predicted_group    prob_A
## 1     A        8       6          8       10               A 0.9999999
## 2     A        7       7          8        9               A 0.9999588
## 3     A        8       6          9        9               A 0.9999985
## 4     A        9       5          7        9               A 0.9999971
## 5     A       10       8          9       10               A 0.9999976
## 6     A        7       5          6        8               A 0.9995531
##         prob_B descriment_values
## 1 1.086477e-07         -3.673802
## 2 4.122222e-05         -2.313199
## 3 1.485701e-06         -3.074559
## 4 2.860349e-06         -2.924479
## 5 2.411406e-06         -2.963596
## 6 4.468691e-04         -1.767073

Evaluation of LDA :-

Confusion Matrix :-

f <- table(df$group , df$Predicted_group)
f
##    
##      A  B
##   A 14  1
##   B  0 15

Correct Classificaton score :-

round ( (sum(diag(f)) / sum(f))*100 , 2 )
## [1] 96.67
remove(f)

Predicted for new data :-

nd <- data.frame(security=c(6, 5), ecology=c(8, 5),
innovation=c(7, 6), prestige=c(8, 5))

predict(fit , newdata = nd)
## $class
## [1] A B
## Levels: A B
## 
## $posterior
##            A          B
## 1 0.94436846 0.05563154
## 2 0.01991636 0.98008364
## 
## $x
##          LD1
## 1 -0.6487837
## 2  0.8926315