Instructions

This is role playing. I am your new boss. I am in charge of production at ABC Beverage and you are a team of data scientists reporting to me. My leadership has told me that new regulations are requiring us to understand our manufacturing process, the predictive factors and be able to report to them our predictive model of PH.

Please use the historical data set I am providing. Build and report the factors in BOTH a technical and non-technical report. I like to use Word and Excel. Please provide your non-technical report in a business friendly readable document and your predictions in an Excel readable format. The technical report should show clearly the models you tested and how you selected your final approach.

Please submit both Rpubs links and .rmd files (or other readable formats) for the technical and non-technical reports. Also submit the Excel file showing your models' predictions for pH.

We start by loading relevant libraries for data manipulation, visualization, imputation, and modeling.

# Load required libraries
library(tidyverse)    # data manipulation and ggplot2 graphics
library(caret)        # preprocessing, resampling, and model training
library(mice)         # multiple imputation by chained equations
library(kableExtra)   # styled HTML tables
library(corrplot)     # correlation matrix visualization
library(randomForest) # random forest models
library(gbm)          # gradient boosting machines
library(nnet)         # neural networks
library(Cubist)       # Cubist rule-based regression models
library(openxlsx)     # writing Excel files
library(ggpubr)       # arranging ggplot figures
library(viridis)      # color scales
library(hrbrthemes)   # ggplot themes
library(e1071)        # skewness and other statistics
library(DT)           # interactive data tables

Load datasets

Load the datasets, converting empty strings to NA so that missing values can be imputed later.

df_StudentData <- read.csv('https://raw.githubusercontent.com/uplotnik/DATA-624/refs/heads/main/StudentData.csv', na.strings = c("", NA))
df_EvalData <- read.csv('https://raw.githubusercontent.com/uplotnik/DATA-624/refs/heads/main/StudentEvaluation.csv', na.strings = c("", NA))
# Check first rows of beverage and evaluation data
DT::datatable(
  df_StudentData[1:10, ],
  options = list(scrollX = TRUE,
                 deferRender = TRUE,
                 dom = 'lBfrtip',
                 fixedColumns = TRUE,
                 info = FALSE,
                 paging = FALSE,
                 searching = FALSE),
  rownames = FALSE,
  caption = htmltools::tags$caption(
    style = 'caption-side: top; text-align: left; font-size: 16px; font-weight: bold;',
    'Table 1: First 10 Rows of Beverage Data'
  ))

DT::datatable(
  df_EvalData[1:10, ],
  options = list(scrollX = TRUE,
                 deferRender = TRUE,
                 dom = 'lBfrtip',
                 fixedColumns = TRUE,
                 info = FALSE,
                 paging = FALSE,
                 searching = FALSE),
  rownames = FALSE,
  caption = htmltools::tags$caption(
    style = 'caption-side: top; text-align: left; font-size: 16px; font-weight: bold;',
    'Table 2: First 10 Rows of Evaluation Data'
  ))

Check the dataset dimensions and use glimpse() for a quick overview.

# Find data dimensions.
dims <- data.frame("Train" = dim(df_StudentData),
                   "Eval" = dim(df_EvalData))
rownames(dims) <- c("Observations","Variables")
dims
##              Train Eval
## Observations  2571  267
## Variables       33   33

The Training set contains 2,571 observations and 33 variables: 32 predictors plus the response PH. The Evaluation set consists of 267 observations with the same 33 variables.

glimpse(df_StudentData)
## Rows: 2,571
## Columns: 33
## $ Brand.Code        <chr> "B", "A", "B", "A", "A", "A", "A", "B", "B", "B", "B…
## $ Carb.Volume       <dbl> 5.340000, 5.426667, 5.286667, 5.440000, 5.486667, 5.…
## $ Fill.Ounces       <dbl> 23.96667, 24.00667, 24.06000, 24.00667, 24.31333, 23…
## $ PC.Volume         <dbl> 0.2633333, 0.2386667, 0.2633333, 0.2933333, 0.111333…
## $ Carb.Pressure     <dbl> 68.2, 68.4, 70.8, 63.0, 67.2, 66.6, 64.2, 67.6, 64.2…
## $ Carb.Temp         <dbl> 141.2, 139.6, 144.8, 132.6, 136.8, 138.4, 136.8, 141…
## $ PSC               <dbl> 0.104, 0.124, 0.090, NA, 0.026, 0.090, 0.128, 0.154,…
## $ PSC.Fill          <dbl> 0.26, 0.22, 0.34, 0.42, 0.16, 0.24, 0.40, 0.34, 0.12…
## $ PSC.CO2           <dbl> 0.04, 0.04, 0.16, 0.04, 0.12, 0.04, 0.04, 0.04, 0.14…
## $ Mnf.Flow          <dbl> -100, -100, -100, -100, -100, -100, -100, -100, -100…
## $ Carb.Pressure1    <dbl> 118.8, 121.6, 120.2, 115.2, 118.4, 119.6, 122.2, 124…
## $ Fill.Pressure     <dbl> 46.0, 46.0, 46.0, 46.4, 45.8, 45.6, 51.8, 46.8, 46.0…
## $ Hyd.Pressure1     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ Hyd.Pressure2     <dbl> NA, NA, NA, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ Hyd.Pressure3     <dbl> NA, NA, NA, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ Hyd.Pressure4     <int> 118, 106, 82, 92, 92, 116, 124, 132, 90, 108, 94, 86…
## $ Filler.Level      <dbl> 121.2, 118.6, 120.0, 117.8, 118.6, 120.2, 123.4, 118…
## $ Filler.Speed      <int> 4002, 3986, 4020, 4012, 4010, 4014, NA, 1004, 4014, …
## $ Temperature       <dbl> 66.0, 67.6, 67.0, 65.6, 65.6, 66.2, 65.8, 65.2, 65.4…
## $ Usage.cont        <dbl> 16.18, 19.90, 17.76, 17.42, 17.68, 23.82, 20.74, 18.…
## $ Carb.Flow         <int> 2932, 3144, 2914, 3062, 3054, 2948, 30, 684, 2902, 3…
## $ Density           <dbl> 0.88, 0.92, 1.58, 1.54, 1.54, 1.52, 0.84, 0.84, 0.90…
## $ MFR               <dbl> 725.0, 726.8, 735.0, 730.6, 722.8, 738.8, NA, NA, 74…
## $ Balling           <dbl> 1.398, 1.498, 3.142, 3.042, 3.042, 2.992, 1.298, 1.2…
## $ Pressure.Vacuum   <dbl> -4.0, -4.0, -3.8, -4.4, -4.4, -4.4, -4.4, -4.4, -4.4…
## $ PH                <dbl> 8.36, 8.26, 8.94, 8.24, 8.26, 8.32, 8.40, 8.38, 8.38…
## $ Oxygen.Filler     <dbl> 0.022, 0.026, 0.024, 0.030, 0.030, 0.024, 0.066, 0.0…
## $ Bowl.Setpoint     <int> 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 12…
## $ Pressure.Setpoint <dbl> 46.4, 46.8, 46.6, 46.0, 46.0, 46.0, 46.0, 46.0, 46.0…
## $ Air.Pressurer     <dbl> 142.6, 143.0, 142.0, 146.2, 146.2, 146.6, 146.2, 146…
## $ Alch.Rel          <dbl> 6.58, 6.56, 7.66, 7.14, 7.14, 7.16, 6.54, 6.52, 6.52…
## $ Carb.Rel          <dbl> 5.32, 5.30, 5.84, 5.42, 5.44, 5.44, 5.38, 5.34, 5.34…
## $ Balling.Lvl       <dbl> 1.48, 1.56, 3.28, 3.04, 3.04, 3.02, 1.44, 1.44, 1.44…

The dataset comprises 33 variables and 2,571 observations. Among these, Brand.Code is categorical, while the remaining variables are integer or numeric. Several variables appear skewed, suggesting that transformation, along with centering and scaling, may be beneficial in subsequent analysis.

summary(df_StudentData) %>%
  kable() %>%
  kable_styling(full_width = FALSE, bootstrap_options = c("striped", "hover", "condensed")) %>%
  scroll_box(width = "100%", height = "250px")
[Summary statistics table for all 33 variables: Min., 1st Qu., Median, Mean, 3rd Qu., Max., and NA counts per column. Most columns have fewer than 40 missing values; the notable exceptions are MFR (212 NAs) and Filler.Speed (57 NAs), while the response PH has 4. Mnf.Flow and Pressure.Vacuum take negative values, and Brand.Code is the only character column.]

Exploratory Data Analysis

Visualizations like histograms and boxplots will be used to explore the distributions of numeric variables.

# Histograms for numeric variables
df_StudentData %>%
  select_if(is.numeric) %>%
  pivot_longer(cols = everything(), names_to = "variable", values_to = "value") %>%
  ggplot(aes(x = value)) +
  geom_histogram(aes(y = after_stat(density)), bins = 15, fill = "skyblue", alpha = 0.7, color = "black") +
  geom_density(color = "red", linewidth = 1) +
  facet_wrap(~variable, scales = "free") +
  labs(title = "Histograms and Density Plots of Numeric Variables", x = "Value", y = "Density") +
  theme_minimal()

The training data shows different distribution patterns across the variables:

  • Relatively normal distributions: Carb.Pressure, Carb.Temp, Fill.Ounces, PC.Volume, PH (response variable)

  • Left-skewed distributions: Carb.Flow, Filler.Speed, Mnf.Flow, MFR, Bowl.Setpoint, Filler.Level, Hyd.Pressure2, Hyd.Pressure3, Usage.cont, Carb.Pressure1

  • Right-skewed distributions: Pressure.Setpoint, Fill.Pressure, Hyd.Pressure1, Temperature, Carb.Volume, PSC, PSC.CO2, PSC.Fill, Balling, Density, Hyd.Pressure4, Air.Pressurer, Alch.Rel, Carb.Rel, Oxygen.Filler, Balling.Lvl, Pressure.Vacuum

# Boxplots for numeric variables
df_StudentData %>%
  select_if(is.numeric) %>%
  gather(key = "variable", value = "value") %>%
  ggplot(aes(x = variable, y = value)) +
  geom_boxplot(fill = "orange", alpha = 0.7) +
  coord_flip() +
  ggtitle("Boxplots of Numerical Predictors")
## Warning: Removed 724 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

Data Preparation

Convert Brand.Code to factor

We convert the categorical variable Brand.Code to a factor and visualize its proportional distribution.

# Convert Brand.Code to factor
df_StudentData$Brand.Code <- as.factor(df_StudentData$Brand.Code)
df_EvalData$Brand.Code <- as.factor(df_EvalData$Brand.Code)

# Distribution of Brand.Code
df_StudentData %>%
  count(Brand.Code) %>%
  mutate(prop = n / sum(n)) %>%
  ggplot(aes(x = Brand.Code, y = prop, fill = Brand.Code)) +
  geom_col(show.legend = FALSE) +
  geom_text(aes(label = scales::percent(prop, accuracy = 0.1)), vjust = -0.5) +
  labs(title = "Proportion Distribution of Brand.Code", y = "Proportion", x = "Brand.Code") +
  scale_y_continuous(labels = scales::percent_format()) +
  theme_minimal()

Missing values imputation

The next phase tackles missing data and data quality issues. To fill in absent entries with statistically plausible values while maintaining the dataset’s integrity, we apply Multiple Imputation by Chained Equations (MICE) with predictive mean matching, generating a single completed dataset (m = 1). After imputation, variables with near-zero variance will be removed to reduce noise.

# Impute missing values using mice with predictive mean matching (pmm)
df_StudentData_imp <- mice(df_StudentData, m = 1, method = 'pmm', printFlag = FALSE) %>% complete()
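
The evaluation set will need the same treatment before it can be scored. A minimal sketch, assuming (as in this assignment) that the PH column in the evaluation file is blank and should be excluded from imputation:

# Hypothetical: impute the evaluation predictors the same way for later scoring
df_EvalData_imp <- df_EvalData %>%
  select(-PH) %>%            # PH is empty in the evaluation file
  mice(m = 1, method = 'pmm', printFlag = FALSE) %>%
  complete()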

Check whether any missing values remain

# Check whether any missing values remain
df_StudentData_imp %>%
  summarise_all(~sum(is.na(.))) %>%
  gather(variable, missing) %>%
  filter(missing != 0) %>%
  kable() %>%
  kable_styling()
(The resulting table is empty, confirming that no variables have missing values remaining after imputation.)
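
An equivalent one-line programmatic check (a minimal sketch):

# Sanity check: the total count of NAs across the imputed data should be zero
stopifnot(sum(is.na(df_StudentData_imp)) == 0)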

Remove near-zero variance variables

# Remove near-zero variance variables
nzv_vars <- nearZeroVar(df_StudentData_imp)
if (length(nzv_vars) > 0) {
  df_StudentData_imp <- df_StudentData_imp[, -nzv_vars]
}
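
For transparency, the flagged column can be listed by name; judging by the skewness output below (where Hyd.Pressure1 no longer appears), it is the one variable removed here. A small check, to be run before the drop above:

# List near-zero-variance columns by name (run before dropping them)
nearZeroVar(df_StudentData_imp, names = TRUE)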

Calculate skewness for numeric variables

Now we will identify highly skewed features and apply Box-Cox transformation to improve normality and model performance.
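
For reference, caret’s BoxCoxTrans() estimates the power parameter $\lambda$ by maximum likelihood for the standard Box-Cox family:

$$
x^{(\lambda)} =
\begin{cases}
\dfrac{x^{\lambda} - 1}{\lambda}, & \lambda \neq 0 \\
\log x, & \lambda = 0
\end{cases}
$$

Because the transform is defined only for positive values, the loop below shifts any variable containing zeros or negatives before estimating $\lambda$.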

# Identify numeric columns (factors such as Brand.Code are excluded;
# the target PH is retained here and handled separately later)
numeric_vars <- df_StudentData_imp %>%
  select_if(is.numeric) %>%
  colnames()

# Calculate skewness for numeric variables
skew_vals <- sapply(df_StudentData_imp[, numeric_vars], skewness)
skew_vals
##       Carb.Volume       Fill.Ounces         PC.Volume     Carb.Pressure 
##       0.392987845      -0.018219011       0.350803833       0.215914218 
##         Carb.Temp               PSC          PSC.Fill           PSC.CO2 
##       0.293521050       0.845988838       0.934056193       1.733291995 
##          Mnf.Flow    Carb.Pressure1     Fill.Pressure     Hyd.Pressure2 
##       0.005689593       0.064502693       0.527630028      -0.302562814 
##     Hyd.Pressure3     Hyd.Pressure4      Filler.Level      Filler.Speed 
##      -0.319386874       0.570827505      -0.848527899      -2.522755492 
##       Temperature        Usage.cont         Carb.Flow           Density 
##       2.373388710      -0.536824745      -0.984828593       0.525817484 
##               MFR           Balling   Pressure.Vacuum                PH 
##      -2.789146468       0.594525892       0.525660793      -0.290771828 
##     Oxygen.Filler     Bowl.Setpoint Pressure.Setpoint     Air.Pressurer 
##       2.634209739      -0.974655906       0.203494663       2.252105286 
##          Alch.Rel          Carb.Rel       Balling.Lvl 
##       0.893490654       0.500206930       0.586417717
# Threshold for high skewness
skew_threshold <- 1

# Variables to transform (highly skewed)
vars_to_transform <- names(skew_vals[abs(skew_vals) > skew_threshold])
print("Highly skewed variables:")
## [1] "Highly skewed variables:"
print(vars_to_transform)
## [1] "PSC.CO2"       "Filler.Speed"  "Temperature"   "MFR"          
## [5] "Oxygen.Filler" "Air.Pressurer"
# Apply Box-Cox transformation to variables with high skewness
for (var in vars_to_transform) {
  # Extract variable vector
  x <- df_StudentData_imp[[var]]

  # Shift data if zeros or negatives exist (Box-Cox requires positive values)
  min_x <- min(x, na.rm = TRUE)
  shift <- 0
  if (min_x <= 0) {
    shift <- abs(min_x) + 1
    x <- x + shift
    message(paste("Shifted", var, "by", shift, "to make positive for Box-Cox"))
  }

  # Estimate the Box-Cox transformation
  bc_obj <- BoxCoxTrans(x)

  # Transform the data using the estimated lambda
  x_bc <- predict(bc_obj, x)

  # Replace the original variable with the transformed values
  # (note: where a shift was applied, it is baked into the transformed scale)
  df_StudentData_imp[[var]] <- x_bc
}
## Shifted PSC.CO2 by 1 to make positive for Box-Cox

Recalculate skewness after Box-Cox transformation

skew_vals_after_bc <- sapply(df_StudentData_imp[, vars_to_transform], skewness)
print("Skewness after Box-Cox transformation:")
## [1] "Skewness after Box-Cox transformation:"
print(skew_vals_after_bc)
##       PSC.CO2  Filler.Speed   Temperature           MFR Oxygen.Filler 
##     1.2869311    -2.3511135     1.8528344    -2.3566364    -0.1216752 
## Air.Pressurer 
##     2.1883574
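
Even after the Box-Cox step, Filler.Speed, MFR, Temperature, and Air.Pressurer remain strongly skewed; the transformation helps most for Oxygen.Filler (skewness drops from 2.63 to -0.12) and PSC.CO2 (1.73 to 1.29). The outlier capping in the next step further tempers the remaining extreme values.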

Cap outliers at 1st and 99th percentiles in all numeric variables

In this step, outliers in the numeric variables are capped at the 1st and 99th percentiles to mitigate their influence.

for (var in numeric_vars) {
  lower_bound <- quantile(df_StudentData_imp[[var]], 0.01, na.rm = TRUE)
  upper_bound <- quantile(df_StudentData_imp[[var]], 0.99, na.rm = TRUE)
  df_StudentData_imp[[var]] <- ifelse(df_StudentData_imp[[var]] < lower_bound, lower_bound, df_StudentData_imp[[var]])
  df_StudentData_imp[[var]] <- ifelse(df_StudentData_imp[[var]] > upper_bound, upper_bound, df_StudentData_imp[[var]])
}
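
The same capping can be written more compactly with pmin()/pmax(); a vectorized equivalent of the loop above (a sketch, not run here):

# Vectorized equivalent: cap each numeric variable at its 1st/99th percentiles
df_StudentData_imp <- df_StudentData_imp %>%
  mutate(across(all_of(numeric_vars),
                ~ pmax(pmin(., quantile(., 0.99, na.rm = TRUE)),
                       quantile(., 0.01, na.rm = TRUE))))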

Correlation Check

Now we will conduct correlation analysis between the target variable PH and all other numeric predictors to understand their relationships and inform feature selection.

# Prepare the data for correlation analysis
cor_data <- df_StudentData_imp %>%
    select_if(is.numeric)

# Compute correlations between 'PH' and all other predictors
corr_values <- cor_data %>%
    summarise(across(.cols = everything(), 
                     .fns = ~ cor(., cor_data$PH, use = "complete.obs"), 
                     .names = "cor_{col}")) %>%
    pivot_longer(cols = everything(), names_to = "Predictor", values_to = "Correlation") %>%
    mutate(Predictor = gsub("cor_", "", Predictor)) %>%
    filter(Predictor != "PH") %>% 
    arrange(desc(abs(Correlation)))


print(corr_values)
## # A tibble: 30 × 2
##    Predictor         Correlation
##    <chr>                   <dbl>
##  1 Mnf.Flow               -0.449
##  2 Bowl.Setpoint           0.354
##  3 Filler.Level            0.330
##  4 Usage.cont             -0.322
##  5 Pressure.Setpoint      -0.310
##  6 Hyd.Pressure3          -0.240
##  7 Pressure.Vacuum         0.219
##  8 Fill.Pressure          -0.218
##  9 Hyd.Pressure2          -0.204
## 10 Oxygen.Filler           0.203
## # ℹ 20 more rows

We analyzed how PH relates to each numeric predictor to get a first sense of what might affect it. The correlation table ranks the predictors by the absolute strength of their linear relationship with PH. Mnf.Flow shows the strongest negative correlation at -0.45, while Bowl.Setpoint and Filler.Level show positive correlations of 0.35 and 0.33 respectively, suggesting they could influence PH, although all of these relationships are only moderate.
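
A heatmap of the full correlation matrix, via the corrplot package loaded earlier, is a useful complement to the ranked table; a minimal sketch:

# Visualize pairwise correlations among all numeric variables
corrplot(cor(cor_data),
         method = "color",  # shade cells by correlation strength
         type = "lower",    # show the lower triangle only
         tl.col = "black",  # label color
         tl.cex = 0.6)      # shrink variable labels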

Data Split

Finally, the data is split into training (80%) and testing (20%) sets, with createDataPartition preserving the distribution of the target variable PH across the two sets. Numeric predictors (excluding the target) are then centered and scaled to prepare them for machine learning algorithms that are sensitive to feature scale.

set.seed(100)
index <- createDataPartition(df_StudentData_imp$PH, p = 0.8, list = FALSE)
train_data <- df_StudentData_imp[index, ]
test_data <- df_StudentData_imp[-index, ]
# Separate predictors and target
train_x <- train_data %>% select(-PH)
train_y <- train_data$PH
test_x <- test_data %>% select(-PH)
test_y <- test_data$PH
# Distribution of target variable in train/test sets to check stratification
train_y_df <- data.frame(PH = train_y)
test_y_df <- data.frame(PH = test_y)

p1 <- ggplot(train_y_df, aes(x = PH)) +
  geom_histogram(bins = 20, fill = "steelblue", alpha = 0.7) +
  labs(title = "Distribution of Target Variable PH in Training Set", x = "PH", y = "Count") +
  theme_minimal()

p2 <- ggplot(test_y_df, aes(x = PH)) +
  geom_histogram(bins = 20, fill = "tomato", alpha = 0.7) +
  labs(title = "Distribution of Target Variable PH in Test Set", x = "PH", y = "Count") +
  theme_minimal()

ggarrange(p1, p2, ncol = 2, nrow = 1)

# Scale numeric predictors for algorithms sensitive to scale
# (linear regression, neural networks, k-NN, SVMs, and gradient boosting can benefit)
scale_vars <- numeric_vars[numeric_vars != "PH"]
preProcValues <- preProcess(train_x[, scale_vars], method = c("center", "scale"))

train_x_scaled <- train_x
test_x_scaled <- test_x
train_x_scaled[, scale_vars] <- predict(preProcValues, train_x[, scale_vars])
test_x_scaled[, scale_vars] <- predict(preProcValues, test_x[, scale_vars])

The datasets are now ready for model training and further analysis: missing data, skewness, outliers, and feature scaling have all been addressed.
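
Before the evaluation set can be scored, its predictors must pass through the same pipeline. A minimal sketch, assuming df_EvalData_imp from the earlier imputation sketch; note that the Box-Cox and capping steps would likewise need to be replayed on the evaluation data using training-derived parameters:

# Hypothetical: reuse the training-set preProcess object on the evaluation predictors
eval_x_scaled <- df_EvalData_imp
eval_x_scaled[, scale_vars] <- predict(preProcValues, df_EvalData_imp[, scale_vars])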
