Linear Discriminant Analysis

One of the assumptions of LDA is that the predictors within each class follow a multivariate Gaussian (normal) distribution and that all classes share a common covariance matrix.

This is an attempt to test the impact of dissimilar covariance across the classes on the overall accuracy of predictions. Because LDA pools a single covariance estimate across all classes, a large difference between the class covariances misspecifies the model and could distort the linear decision boundary.
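
For intuition, the boundary LDA fits comes out linear only because every class is assumed to share that one pooled matrix. A minimal sketch of the pooled estimate (pooledCov is a hypothetical helper written for illustration, assuming a data frame with predictors x1 and x2 and a factor outcome, matching the data generated below):

# Pooled within-class covariance, the single matrix LDA estimates (sketch)
pooledCov <- function(d) {
  covs <- lapply(split(d[, c("x1", "x2")], d$outcome), cov)
  ns   <- table(d$outcome)
  # Weight each class covariance by its degrees of freedom, then normalize
  Reduce(`+`, Map(`*`, covs, as.numeric(ns) - 1)) / (nrow(d) - length(covs))
}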

# Load the packages used throughout this analysis
library(mvtnorm)    # rmvnorm
library(MASS)       # lda
library(caret)      # createDataPartition, confusionMatrix
library(dplyr)      # %>% and tibble
library(ggplot2)
library(knitr)      # kable
library(kableExtra) # kable_styling

# A function to generate two Gaussian predictors (x1 and x2) for two classes
# Params: c1 and c2 set the covariance between x1 and x2 within
#         class A and class B, respectively

getData <- function(c1 = 1, c2 = 1){
  # Create covariance matrix
  cm1 <- matrix(c(1,c1,c1,1),ncol=2)
  cm2 <- matrix(c(1,c2,c2,1),ncol=2)
  
  # Generate Gaussian data with the given covariance
  temp1 <- as.data.frame(rmvnorm(500,c(5,8),sigma=cm1))
  temp2 <- as.data.frame(rmvnorm(500,c(3,6),sigma=cm2))
  
  # Add column names
  colnames(temp1) <- c("x1","x2")
  colnames(temp2) <- c("x1","x2")
  
  # Add in outcome variables
  temp1$outcome <- as.factor("A")
  temp2$outcome <- as.factor("B")
  
  return(rbind(temp1,temp2))
}
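
As a quick sanity check (a sketch, run on a throwaway draw), the per-class sample covariances should be close to the matrices passed in:

# The off-diagonal entries should be near the requested c1 and c2
d <- getData(c1 = 0.2, c2 = 0.8)
cov(d[d$outcome == "A", c("x1", "x2")])  # off-diagonal ~ 0.2
cov(d[d$outcome == "B", c("x1", "x2")])  # off-diagonal ~ 0.8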

Generate Data

We will generate four datasets in which the covariance between the predictors differs increasingly between the two classes.

set.seed(4321)

# Generate data sets
data1 <- getData(0.21,0.24)
data2 <- getData(0.21,0.49)
data3 <- getData(0.28,0.68)
data4 <- getData(0.28,0.87)

# Split into train and test
split1 <- createDataPartition(data1$outcome,p=0.7,list=F)
split2 <- createDataPartition(data2$outcome,p=0.7,list=F)
split3 <- createDataPartition(data3$outcome,p=0.7,list=F)
split4 <- createDataPartition(data4$outcome,p=0.7,list=F)
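
createDataPartition stratifies on the outcome, so each split should preserve the 50/50 class balance; a quick check (sketch):

# Roughly 350 of each class in training, 150 of each in testing
table(data1[split1, "outcome"])
table(data1[-split1, "outcome"])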

# Plot the training data

data1[split1,] %>% ggplot(aes(x=x1,y=x2, col=outcome)) + geom_point() +
  labs(title="Data 1", subtitle="Training Data")

data2[split2,] %>% ggplot(aes(x=x1,y=x2, col=outcome)) + geom_point() +
  labs(title="Data 2", subtitle="Training Data")

data3[split3,] %>% ggplot(aes(x=x1,y=x2, col=outcome)) + geom_point() +
  labs(title="Data 3", subtitle="Training Data")

data4[split4,] %>% ggplot(aes(x=x1,y=x2, col=outcome)) + geom_point() +
  labs(title="Data 4", subtitle="Training Data")

Train Models

Now we will train the LDA models (one per dataset) and look at the accuracy of each on its respective test set.

# Train LDA models
fit1 <- lda(outcome ~ ., data = data1[split1,])
fit2 <- lda(outcome ~ ., data = data2[split2,])
fit3 <- lda(outcome ~ ., data = data3[split3,])
fit4 <- lda(outcome ~ ., data = data4[split4,])
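
Before predicting, it can be worth inspecting a fit; lda stores the per-class means and the discriminant coefficients (shown here for fit1, as an example):

# Group means of x1 and x2, and the coefficients of the single
# linear discriminant
fit1$means
fit1$scaling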

# Predict on the test sets
pred1 <- predict(fit1, newdata=data1[-split1,])
pred2 <- predict(fit2, newdata=data2[-split2,])
pred3 <- predict(fit3, newdata=data3[-split3,])
pred4 <- predict(fit4, newdata=data4[-split4,])
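
predict.lda returns more than a label; alongside the predicted class it gives posterior probabilities and discriminant scores, which can be useful for diagnostics (a quick look, for illustration):

# Predicted labels, class posteriors, and scores on the discriminant axis
head(pred1$class)
head(pred1$posterior)
head(pred1$x)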

# Output accuracy measure for each prediction set
measures <- tibble(Dataset = 1, Accuracy = confusionMatrix(table(prediction = pred1$class, actual = data1[-split1,]$outcome))$overall["Accuracy"])

measures <- rbind(measures,tibble(Dataset = 2, Accuracy = confusionMatrix(table(prediction = pred2$class, actual = data2[-split2,]$outcome))$overall["Accuracy"]))

measures <- rbind(measures,tibble(Dataset = 3, Accuracy = confusionMatrix(table(prediction = pred3$class, actual = data3[-split3,]$outcome))$overall["Accuracy"]))

measures <- rbind(measures,tibble(Dataset = 4, Accuracy = confusionMatrix(table(prediction = pred4$class, actual = data4[-split4,]$outcome))$overall["Accuracy"]))

measures %>% kable() %>%
  kable_styling(bootstrap_options = "striped", full_width=F)

Dataset    Accuracy
      1   0.9100000
      2   0.9166667
      3   0.9066667
      4   0.9033333
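
To make the trend easier to read, the accuracies could also be plotted against the dataset index; a sketch using the measures tibble built above:

# Accuracy by dataset; datasets are ordered by how dissimilar the
# class covariances are
measures %>%
  ggplot(aes(x = Dataset, y = Accuracy)) +
  geom_line() +
  geom_point() +
  labs(title = "LDA accuracy as class covariances diverge")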