One of the assumptions about LDA is that the predictors are Gaussian (normal) and come from a multivariate distribution with a common covariance matrix.
This is an attempt to test the impact of dissimilar covariance across the classes on overall accuracy of predictions.
# A function to generate 2 random Gaussian independent variables
# Params: Covariance between each variable (x1 and x2)
getData <- function(c1 = 1, c2 = 1){
# Create covariance matrix
cm1 <- matrix(c(1,c1,c1,1),ncol=2)
cm2 <- matrix(c(1,c2,c2,1),ncol=2)
# Generate Gaussian data with the given covariance
temp1 <- as.data.frame(rmvnorm(500,c(5,8),sigma=cm1))
temp2 <- as.data.frame(rmvnorm(500,c(3,6),sigma=cm2))
# Add column names
colnames(temp1) <- c("x1","x2")
colnames(temp2) <- c("x1","x2")
# Add in outcome variables
temp1$outcome <- as.factor("A")
temp2$outcome <- as.factor("B")
return(rbind(temp1,temp2))
}
We will generate 4 sets of data with increasingly different covariances
set.seed(4321)
# Generate data sets
data1 <- getData(0.21,0.24)
data2 <- getData(0.21,0.49)
data3 <- getData(0.28,0.68)
data4 <- getData(0.28,0.87)
# Split into train and test
split1 <- createDataPartition(data1$outcome,p=0.7,list=F)
split2 <- createDataPartition(data2$outcome,p=0.7,list=F)
split3 <- createDataPartition(data3$outcome,p=0.7,list=F)
split4 <- createDataPartition(data4$outcome,p=0.7,list=F)
# Plot the training data
data1[split1,] %>% ggplot(aes(x=x1,y=x2, col=outcome)) + geom_point() +
labs(title="Data 1", subtitle="Training Data")
data2[split2,] %>% ggplot(aes(x=x1,y=x2, col=outcome)) + geom_point() +
labs(title="Data 2", subtitle="Training Data")
data3[split3,] %>% ggplot(aes(x=x1,y=x2, col=outcome)) + geom_point() +
labs(title="Data 3", subtitle="Training Data")
data4[split4,] %>% ggplot(aes(x=x1,y=x2, col=outcome)) + geom_point() +
labs(title="Data 4", subtitle="Training Data")
Now we will train the LDA models (1 for each dataset) and look at the accuracy measures for each on their respective test sets.
# Train LDA models
fit1 <- lda(outcome ~ ., data = data1[split1,])
fit2 <- lda(outcome ~ ., data = data2[split2,])
fit3 <- lda(outcome ~ ., data = data3[split3,])
fit4 <- lda(outcome ~ ., data = data4[split4,])
# Predict on Test set
pred1 <- predict(fit1, newdata=data1[-split1,])
pred2 <- predict(fit2, newdata=data1[-split2,])
pred3 <- predict(fit3, newdata=data1[-split3,])
pred4 <- predict(fit4, newdata=data1[-split4,])
# Output accuracy measure for each prediction set
measures <- tibble(Dataset = 1, Accuracy = confusionMatrix(table(prediction = pred1$class, actual = data1[-split1,]$outcome))$overall["Accuracy"])
measures <- rbind(measures,tibble(Dataset = 2, Accuracy = confusionMatrix(table(prediction = pred2$class, actual = data2[-split2,]$outcome))$overall["Accuracy"]))
measures <- rbind(measures,tibble(Dataset = 3, Accuracy = confusionMatrix(table(prediction = pred3$class, actual = data3[-split3,]$outcome))$overall["Accuracy"]))
measures <- rbind(measures,tibble(Dataset = 4, Accuracy = confusionMatrix(table(prediction = pred4$class, actual = data4[-split4,]$outcome))$overall["Accuracy"]))
measures %>% kable() %>%
kable_styling(bootstrap_options = "striped", full_width=F)
| Dataset | Accuracy |
|---|---|
| 1 | 0.9100000 |
| 2 | 0.9166667 |
| 3 | 0.9066667 |
| 4 | 0.9033333 |