library('readxl')
## Warning: package 'readxl' was built under R version 4.4.3
# Absolute location of the semicolon-delimited red-wine quality data file
file_path <- "C:/Users/nickw/OneDrive - University of Cincinnati/Desktop/BANA 7051/Module 1 - Exploratory Data Analysis/winequality-red.csv"
# check.names = FALSE keeps the column headers exactly as they appear in the
# file (they contain spaces, e.g. "fixed acidity")
wine_data <- read.csv(
  file = file_path,
  sep = ";",
  check.names = FALSE
)
# Preview the first six rows
head(wine_data, n = 6L)
## fixed acidity volatile acidity citric acid residual sugar chlorides
## 1 7.4 0.70 0.00 1.9 0.076
## 2 7.8 0.88 0.00 2.6 0.098
## 3 7.8 0.76 0.04 2.3 0.092
## 4 11.2 0.28 0.56 1.9 0.075
## 5 7.4 0.70 0.00 1.9 0.076
## 6 7.4 0.66 0.00 1.8 0.075
## free sulfur dioxide total sulfur dioxide density pH sulphates alcohol
## 1 11 34 0.9978 3.51 0.56 9.4
## 2 25 67 0.9968 3.20 0.68 9.8
## 3 15 54 0.9970 3.26 0.65 9.8
## 4 17 60 0.9980 3.16 0.58 9.8
## 5 11 34 0.9978 3.51 0.56 9.4
## 6 13 40 0.9978 3.51 0.56 9.4
## quality
## 1 5
## 2 5
## 3 5
## 4 6
## 5 5
## 6 5
# Report the number of observations (rows) in the data set. The explicit
# trailing newline keeps the next printed line from running on from this one
# when the script is sourced or run non-interactively.
cat("Sample Size: ", nrow(wine_data), "\n", sep = "")
## Sample Size: 1599
# We have a sample size of 1,599
cat("Summary of all variables:\n")
## Summary of all variables:
# Per-column minimum, quartiles, mean and maximum
summary(wine_data)
## fixed acidity volatile acidity citric acid residual sugar
## Min. : 4.60 Min. :0.1200 Min. :0.000 Min. : 0.900
## 1st Qu.: 7.10 1st Qu.:0.3900 1st Qu.:0.090 1st Qu.: 1.900
## Median : 7.90 Median :0.5200 Median :0.260 Median : 2.200
## Mean : 8.32 Mean :0.5278 Mean :0.271 Mean : 2.539
## 3rd Qu.: 9.20 3rd Qu.:0.6400 3rd Qu.:0.420 3rd Qu.: 2.600
## Max. :15.90 Max. :1.5800 Max. :1.000 Max. :15.500
## chlorides free sulfur dioxide total sulfur dioxide density
## Min. :0.01200 Min. : 1.00 Min. : 6.00 Min. :0.9901
## 1st Qu.:0.07000 1st Qu.: 7.00 1st Qu.: 22.00 1st Qu.:0.9956
## Median :0.07900 Median :14.00 Median : 38.00 Median :0.9968
## Mean :0.08747 Mean :15.87 Mean : 46.47 Mean :0.9967
## 3rd Qu.:0.09000 3rd Qu.:21.00 3rd Qu.: 62.00 3rd Qu.:0.9978
## Max. :0.61100 Max. :72.00 Max. :289.00 Max. :1.0037
## pH sulphates alcohol quality
## Min. :2.740 Min. :0.3300 Min. : 8.40 Min. :3.000
## 1st Qu.:3.210 1st Qu.:0.5500 1st Qu.: 9.50 1st Qu.:5.000
## Median :3.310 Median :0.6200 Median :10.20 Median :6.000
## Mean :3.311 Mean :0.6581 Mean :10.42 Mean :5.636
## 3rd Qu.:3.400 3rd Qu.:0.7300 3rd Qu.:11.10 3rd Qu.:6.000
## Max. :4.010 Max. :2.0000 Max. :14.90 Max. :8.000
# Standard deviation of every column, returned as a named numeric vector
vapply(wine_data, sd, FUN.VALUE = numeric(1))
## fixed acidity volatile acidity citric acid
## 1.741096318 0.179059704 0.194801137
## residual sugar chlorides free sulfur dioxide
## 1.409928060 0.047065302 10.460156970
## total sulfur dioxide density pH
## 32.895324478 0.001887334 0.154386465
## sulphates alcohol quality
## 0.169506980 1.065667582 0.807569440
# Rough outlier screen: for every numeric column, count the observations that
# lie more than 2 standard deviations above or below that column's mean.
is_numeric_col <- vapply(wine_data, is.numeric, logical(1))
# drop = FALSE keeps a data frame even when only one column is numeric, so the
# counting function is always applied once per column (never once per value).
# vapply() with integer(1) guarantees a named integer vector, including when
# there are no numeric columns at all.
outlier_counts <- vapply(
  wine_data[, is_numeric_col, drop = FALSE],
  function(x) {
    mean_val <- mean(x, na.rm = TRUE)
    sd_val <- sd(x, na.rm = TRUE)
    # Missing values are ignored both in the thresholds and in the count
    sum(x > (mean_val + 2 * sd_val) | x < (mean_val - 2 * sd_val), na.rm = TRUE)
  },
  integer(1)
)
print(outlier_counts)
## fixed acidity volatile acidity citric acid
## 80 56 35
## residual sugar chlorides free sulfur dioxide
## 75 45 66
## total sulfur dioxide density pH
## 80 81 75
## sulphates alcohol quality
## 59 70 81
# By this 2-standard-deviation rule, each column has between 35 and 81 outliers
# Lay the plots out on a grid of 4 rows by 3 columns
par(mfrow = c(4, 3))
# Draw one histogram per column of the data frame
for (var_name in colnames(wine_data)) {
  plot_title <- paste("Distribution of", var_name)
  hist(wine_data[[var_name]],
       main = plot_title, xlab = var_name,
       col = "skyblue", border = "black")
}
# Several variables look skewed, e.g. fixed acidity, free sulfur dioxide,
# total sulfur dioxide, sulphates, and alcohol
# Go back to the default layout of a single plot per window
par(mfrow = c(1, 1))
# Zoom in on the distribution of the alcohol column on its own
hist(wine_data[["alcohol"]],
     main = "Distribution of Alcohol (Right-Skewed)", xlab = "Alcohol",
     col = "coral", border = "black")
# Boxplots of every variable make the outliers easier to visualize;
# they are drawn on the same 4-row by 3-column grid as the histograms
par(mfrow = c(4, 3))
for (var_name in colnames(wine_data)) {
  plot_title <- paste("Boxplot of", var_name)
  boxplot(wine_data[[var_name]],
          main = plot_title, ylab = var_name,
          col = "lightgreen")
}