Problem Setup

Packages Used
library('readxl')
## Warning: package 'readxl' was built under R version 4.4.3
Import the data set
# Build the path to the red-wine quality CSV from its folder and file name.
# NOTE(review): the folder is an absolute, machine-specific location.
data_dir <- "C:/Users/nickw/OneDrive - University of Cincinnati/Desktop/BANA 7051/Module 1 - Exploratory Data Analysis"
file_path <- file.path(data_dir, "winequality-red.csv")
Give each variable its own column by separating the fields wherever there’s a semicolon
# Parse the semicolon-delimited file. check.names = FALSE keeps the column
# headers exactly as written in the file (including embedded spaces).
wine_data <- read.csv(
  file = file_path,
  sep = ";",
  check.names = FALSE
)
# Preview the first six rows.
head(wine_data, n = 6L)
##   fixed acidity volatile acidity citric acid residual sugar chlorides
## 1           7.4             0.70        0.00            1.9     0.076
## 2           7.8             0.88        0.00            2.6     0.098
## 3           7.8             0.76        0.04            2.3     0.092
## 4          11.2             0.28        0.56            1.9     0.075
## 5           7.4             0.70        0.00            1.9     0.076
## 6           7.4             0.66        0.00            1.8     0.075
##   free sulfur dioxide total sulfur dioxide density   pH sulphates alcohol
## 1                  11                   34  0.9978 3.51      0.56     9.4
## 2                  25                   67  0.9968 3.20      0.68     9.8
## 3                  15                   54  0.9970 3.26      0.65     9.8
## 4                  17                   60  0.9980 3.16      0.58     9.8
## 5                  11                   34  0.9978 3.51      0.56     9.4
## 6                  13                   40  0.9978 3.51      0.56     9.4
##   quality
## 1       5
## 2       5
## 3       5
## 4       6
## 5       5
## 6       5

Gathering Information on the Data

Sample size
# Report the number of observations (rows). The explicit trailing newline
# terminates the line so later console output starts on a fresh line,
# matching the other cat() call in this report.
cat("Sample Size: ", nrow(wine_data), "\n", sep = "")
## Sample Size: 1599
# We have a sample size of 1,599
Summarizing the data
# Emit a heading, then the per-column summary statistics table.
cat("Summary of all variables:\n")
## Summary of all variables:
print(summary(wine_data))
##  fixed acidity   volatile acidity  citric acid    residual sugar  
##  Min.   : 4.60   Min.   :0.1200   Min.   :0.000   Min.   : 0.900  
##  1st Qu.: 7.10   1st Qu.:0.3900   1st Qu.:0.090   1st Qu.: 1.900  
##  Median : 7.90   Median :0.5200   Median :0.260   Median : 2.200  
##  Mean   : 8.32   Mean   :0.5278   Mean   :0.271   Mean   : 2.539  
##  3rd Qu.: 9.20   3rd Qu.:0.6400   3rd Qu.:0.420   3rd Qu.: 2.600  
##  Max.   :15.90   Max.   :1.5800   Max.   :1.000   Max.   :15.500  
##    chlorides       free sulfur dioxide total sulfur dioxide    density      
##  Min.   :0.01200   Min.   : 1.00       Min.   :  6.00       Min.   :0.9901  
##  1st Qu.:0.07000   1st Qu.: 7.00       1st Qu.: 22.00       1st Qu.:0.9956  
##  Median :0.07900   Median :14.00       Median : 38.00       Median :0.9968  
##  Mean   :0.08747   Mean   :15.87       Mean   : 46.47       Mean   :0.9967  
##  3rd Qu.:0.09000   3rd Qu.:21.00       3rd Qu.: 62.00       3rd Qu.:0.9978  
##  Max.   :0.61100   Max.   :72.00       Max.   :289.00       Max.   :1.0037  
##        pH          sulphates         alcohol         quality     
##  Min.   :2.740   Min.   :0.3300   Min.   : 8.40   Min.   :3.000  
##  1st Qu.:3.210   1st Qu.:0.5500   1st Qu.: 9.50   1st Qu.:5.000  
##  Median :3.310   Median :0.6200   Median :10.20   Median :6.000  
##  Mean   :3.311   Mean   :0.6581   Mean   :10.42   Mean   :5.636  
##  3rd Qu.:3.400   3rd Qu.:0.7300   3rd Qu.:11.10   3rd Qu.:6.000  
##  Max.   :4.010   Max.   :2.0000   Max.   :14.90   Max.   :8.000
Standard Deviation of each variable
# Standard deviation of every numeric column, returned as a named numeric
# vector. Non-numeric columns are skipped and missing values ignored, in line
# with the outlier count below; vapply() guarantees one double per column.
vapply(Filter(is.numeric, wine_data), sd, numeric(1), na.rm = TRUE)
##        fixed acidity     volatile acidity          citric acid 
##          1.741096318          0.179059704          0.194801137 
##       residual sugar            chlorides  free sulfur dioxide 
##          1.409928060          0.047065302         10.460156970 
## total sulfur dioxide              density                   pH 
##         32.895324478          0.001887334          0.154386465 
##            sulphates              alcohol              quality 
##          0.169506980          1.065667582          0.807569440
# Count, for each numeric column, how many observations lie more than
# `sd_threshold` standard deviations away from that column's mean.
sd_threshold <- 2

# Filter() keeps the result a data frame even when only one column is
# numeric; `wine_data[, mask]` would collapse to a bare vector in that case
# and the counts would be computed per element instead of per column.
outlier_counts <- vapply(
  Filter(is.numeric, wine_data),
  function(x) {
    mean_val <- mean(x, na.rm = TRUE)
    sd_val <- sd(x, na.rm = TRUE)
    upper <- mean_val + sd_threshold * sd_val
    lower <- mean_val - sd_threshold * sd_val
    # Missing values are excluded from the count rather than propagating NA.
    sum(x > upper | x < lower, na.rm = TRUE)
  },
  integer(1)
)
print(outlier_counts)
##        fixed acidity     volatile acidity          citric acid 
##                   80                   56                   35 
##       residual sugar            chlorides  free sulfur dioxide 
##                   75                   45                   66 
## total sulfur dioxide              density                   pH 
##                   80                   81                   75 
##            sulphates              alcohol              quality 
##                   59                   70                   81
# We have anywhere from 35 to 81 outliers in each column

Generate Plots of the Data

# Arrange the plotting device as a 4 x 3 panel grid (12 panels).
par(mfrow = c(4, 3))

# Draw one histogram per column, titled and labelled with the column name.
# invisible() suppresses printing of the histogram objects lapply() collects.
invisible(lapply(names(wine_data), function(variable) {
  hist(
    wine_data[[variable]],
    main = paste("Distribution of", variable),
    xlab = variable,
    col = "skyblue",
    border = "black"
  )
}))

# It looks like several variables are skewed, such as fixed acidity, free sulfur dioxide, total sulfur dioxide, sulphates, and alcohol
# Go back to a single-panel layout (one plot per window).
par(mfrow = c(1, 1))

# Full-size histogram of the alcohol column for a closer look at its shape.
hist(
  x = wine_data[["alcohol"]],
  col = "coral",
  border = "black",
  main = "Distribution of Alcohol (Right-Skewed)",
  xlab = "Alcohol"
)

# Boxplots of every variable, to better visualize outliers.
# The device is again split into a 4 x 3 panel grid, one panel per column.
par(mfrow = c(4, 3))
variable_names <- names(wine_data)
for (idx in seq_along(variable_names)) {
  var_name <- variable_names[idx]
  boxplot(
    wine_data[[var_name]],
    main = paste("Boxplot of", var_name),
    ylab = var_name,
    col = "lightgreen"
  )
}