library(ggplot2)
# Load the SOCR height/weight sample. The file has three columns:
# Index (row id), Height and Weight.
df <- read.csv("https://raw.githubusercontent.com/RWorkshop/Statistics-With-R-Workshop/master/data/socrHW.csv")

# Calculate the Mahalanobis distance of every observation from the centroid of
# the joint (Height, Weight) distribution.
# The columns are selected by name: column 1 is the row Index, so a positional
# selection such as df[, 1:2] would measure Index/Height instead and make the
# distance grow with row position rather than with unusual body measurements.
hw <- df[, c("Height", "Weight")]
m_dist <- mahalanobis(hw, center = colMeans(hw), cov = cov(hw))
df$m_dist <- round(m_dist, 2)

# Mahalanobis outliers - threshold set to 12.
# mahalanobis() returns the *squared* distance, so 12 is a cut-off on D^2.
# With two variables that are roughly bivariate normal, D^2 is approximately
# chi-square with 2 degrees of freedom, so about 0.25% of points exceed 12.
df$outlier_maha <- "No"
df$outlier_maha[df$m_dist > 12] <- "Yes"

# Preview the augmented data frame.
head(df)
## Columns: Index, Height, Weight, m_dist, outlier_maha
# Scatterplot with Maha Outliers
# Weight is on the x axis and Height on the y axis; points are coloured by the
# "Yes"/"No" Mahalanobis outlier flag stored in df$outlier_maha.
#
# The data are plotted in their native units. The values in the file (heights
# around 65-72, weights around 110-155) are inches and pounds per the SOCR
# source, so the axes are labelled accordingly. Hard-coded cm/kg break ranges
# would fall outside the data and leave the axes without tick marks, so the
# breaks are derived from the axis limits instead.

# Build a break function for a continuous scale: given the axis limits, return
# every multiple of `step` from just below the lower limit to just above the
# upper limit.
breaks_every <- function(step) {
  function(limits) {
    seq(
      floor(limits[1] / step) * step,
      ceiling(limits[2] / step) * step,
      by = step
    )
  }
}

p <- ggplot(df, aes(x = Weight, y = Height, color = outlier_maha)) +
  geom_point(size = 5, alpha = 0.6) +
  labs(
    title = "Weight vs Height",
    subtitle = "Outlier Detection in weight vs height data - Using Mahalanobis Distances",
    caption = "Source: http://wiki.stat.ucla.edu/socr/index.php/SOCR_Data_Dinov_020108_HeightsWeights",
    x = "Weight in pounds",
    y = "Height in inches"
  ) +
  scale_x_continuous(breaks = breaks_every(10)) +
  scale_y_continuous(breaks = breaks_every(2))

# Print explicitly so the plot is also drawn when the script is source()'d.
print(p)
