Objective
Following new regulations, understand manufacturing process at ABC beverage and produce predictive model for PH.

Deliverables
- Business report (non-technical)
- Predictions in an Excel readable format
- Technical report outlining tested models and selection process

Exploratory Data Analysis

# Read data for train/test, as well as new data (without target) for predictions
train <- read_excel('StudentData.xlsx')

# Dataset structure
str(train)

## tibble [2,571 x 33] (S3: tbl_df/tbl/data.frame)
##  $ Brand Code       : chr [1:2571] "B" "A" "B" "A" ...
##  $ Carb Volume      : num [1:2571] 5.34 5.43 5.29 5.44 5.49 ...
##  $ Fill Ounces      : num [1:2571] 24 24 24.1 24 24.3 ...
##  $ PC Volume        : num [1:2571] 0.263 0.239 0.263 0.293 0.111 ...
##  $ Carb Pressure    : num [1:2571] 68.2 68.4 70.8 63 67.2 66.6 64.2 67.6 64.2 72 ...
##  $ Carb Temp        : num [1:2571] 141 140 145 133 137 ...
##  $ PSC              : num [1:2571] 0.104 0.124 0.09 NA 0.026 0.09 0.128 0.154 0.132 0.014 ...
##  $ PSC Fill         : num [1:2571] 0.26 0.22 0.34 0.42 0.16 ...
##  $ PSC CO2          : num [1:2571] 0.04 0.04 0.16 0.04 0.12 ...
##  $ Mnf Flow         : num [1:2571] -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 ...
##  $ Carb Pressure1   : num [1:2571] 119 122 120 115 118 ...
##  $ Fill Pressure    : num [1:2571] 46 46 46 46.4 45.8 45.6 51.8 46.8 46 45.2 ...
##  $ Hyd Pressure1    : num [1:2571] 0 0 0 0 0 0 0 0 0 0 ...
##  $ Hyd Pressure2    : num [1:2571] NA NA NA 0 0 0 0 0 0 0 ...
##  $ Hyd Pressure3    : num [1:2571] NA NA NA 0 0 0 0 0 0 0 ...
##  $ Hyd Pressure4    : num [1:2571] 118 106 82 92 92 116 124 132 90 108 ...
##  $ Filler Level     : num [1:2571] 121 119 120 118 119 ...
##  $ Filler Speed     : num [1:2571] 4002 3986 4020 4012 4010 ...
##  $ Temperature      : num [1:2571] 66 67.6 67 65.6 65.6 66.2 65.8 65.2 65.4 66.6 ...
##  $ Usage cont       : num [1:2571] 16.2 19.9 17.8 17.4 17.7 ...
##  $ Carb Flow        : num [1:2571] 2932 3144 2914 3062 3054 ...
##  $ Density          : num [1:2571] 0.88 0.92 1.58 1.54 1.54 1.52 0.84 0.84 0.9 0.9 ...
##  $ MFR              : num [1:2571] 725 727 735 731 723 ...
##  $ Balling          : num [1:2571] 1.4 1.5 3.14 3.04 3.04 ...
##  $ Pressure Vacuum  : num [1:2571] -4 -4 -3.8 -4.4 -4.4 -4.4 -4.4 -4.4 -4.4 -4.4 ...
##  $ PH               : num [1:2571] 8.36 8.26 8.94 8.24 8.26 8.32 8.4 8.38 8.38 8.5 ...
##  $ Oxygen Filler    : num [1:2571] 0.022 0.026 0.024 0.03 0.03 0.024 0.066 0.046 0.064 0.022 ...
##  $ Bowl Setpoint    : num [1:2571] 120 120 120 120 120 120 120 120 120 120 ...
##  $ Pressure Setpoint: num [1:2571] 46.4 46.8 46.6 46 46 46 46 46 46 46 ...
##  $ Air Pressurer    : num [1:2571] 143 143 142 146 146 ...
##  $ Alch Rel         : num [1:2571] 6.58 6.56 7.66 7.14 7.14 7.16 6.54 6.52 6.52 6.54 ...
##  $ Carb Rel         : num [1:2571] 5.32 5.3 5.84 5.42 5.44 5.44 5.38 5.34 5.34 5.34 ...
##  $ Balling Lvl      : num [1:2571] 1.48 1.56 3.28 3.04 3.04 3.02 1.44 1.44 1.44 1.38 ...

# Basic statistics
descr <- describe(train)
descr <- transform(descr, na = nrow(train)-n) # add a column with NA count
descr <- transform(descr, na_perc = na/nrow(train)*100)
descr <- round(descr,2)
descr[,c("n","mean","sd","min","max","range","skew","na","na_perc")]

##                      n    mean      sd     min     max   range  skew  na
## Brand Code*       2451     NaN      NA     Inf    -Inf    -Inf    NA 120
## Carb Volume       2561    5.37    0.11    5.04    5.70    0.66  0.39  10
## Fill Ounces       2533   23.97    0.09   23.63   24.32    0.69 -0.02  38
## PC Volume         2532    0.28    0.06    0.08    0.48    0.40  0.34  39
## Carb Pressure     2544   68.19    3.54   57.00   79.40   22.40  0.18  27
## Carb Temp         2545  141.09    4.04  128.60  154.00   25.40  0.25  26
## PSC               2538    0.08    0.05    0.00    0.27    0.27  0.85  33
## PSC Fill          2548    0.20    0.12    0.00    0.62    0.62  0.93  23
## PSC CO2           2532    0.06    0.04    0.00    0.24    0.24  1.73  39
## Mnf Flow          2569   24.57  119.48 -100.20  229.40  329.60  0.00   2
## Carb Pressure1    2539  122.59    4.74  105.60  140.20   34.60  0.05  32
## Fill Pressure     2549   47.92    3.18   34.60   60.40   25.80  0.55  22
## Hyd Pressure1     2560   12.44   12.43   -0.80   58.00   58.80  0.78  11
## Hyd Pressure2     2556   20.96   16.39    0.00   59.40   59.40 -0.30  15
## Hyd Pressure3     2556   20.46   15.98   -1.20   50.00   51.20 -0.32  15
## Hyd Pressure4     2541   96.29   13.12   52.00  142.00   90.00  0.55  30
## Filler Level      2551  109.25   15.70   55.80  161.20  105.40 -0.85  20
## Filler Speed      2514 3687.20  770.82  998.00 4030.00 3032.00 -2.87  57
## Temperature       2557   65.97    1.38   63.60   76.20   12.60  2.39  14
## Usage cont        2566   20.99    2.98   12.08   25.90   13.82 -0.54   5
## Carb Flow         2569 2468.35 1073.70   26.00 5104.00 5078.00 -0.99   2
## Density           2570    1.17    0.38    0.24    1.92    1.68  0.53   1
## MFR               2359  704.05   73.90   31.40  868.60  837.20 -5.09 212
## Balling           2570    2.20    0.93   -0.17    4.01    4.18  0.59   1
## Pressure Vacuum   2571   -5.22    0.57   -6.60   -3.60    3.00  0.53   0
## PH                2567    8.55    0.17    7.88    9.36    1.48 -0.29   4
## Oxygen Filler     2559    0.05    0.05    0.00    0.40    0.40  2.66  12
## Bowl Setpoint     2569  109.33   15.30   70.00  140.00   70.00 -0.97   2
## Pressure Setpoint 2559   47.62    2.04   44.00   52.00    8.00  0.20  12
## Air Pressurer     2571  142.83    1.21  140.80  148.20    7.40  2.25   0
## Alch Rel          2562    6.90    0.51    5.28    8.62    3.34  0.88   9
## Carb Rel          2561    5.44    0.13    4.96    6.06    1.10  0.50  10
## Balling Lvl       2570    2.05    0.87    0.00    3.66    3.66  0.59   1
##                   na_perc
## Brand Code*          4.67
## Carb Volume          0.39
## Fill Ounces          1.48
## PC Volume            1.52
## Carb Pressure        1.05
## Carb Temp            1.01
## PSC                  1.28
## PSC Fill             0.89
## PSC CO2              1.52
## Mnf Flow             0.08
## Carb Pressure1       1.24
## Fill Pressure        0.86
## Hyd Pressure1        0.43
## Hyd Pressure2        0.58
## Hyd Pressure3        0.58
## Hyd Pressure4        1.17
## Filler Level         0.78
## Filler Speed         2.22
## Temperature          0.54
## Usage cont           0.19
## Carb Flow            0.08
## Density              0.04
## MFR                  8.25
## Balling              0.04
## Pressure Vacuum      0.00
## PH                   0.16
## Oxygen Filler        0.47
## Bowl Setpoint        0.08
## Pressure Setpoint    0.47
## Air Pressurer        0.00
## Alch Rel             0.35
## Carb Rel             0.39
## Balling Lvl          0.04

The dataset consists of 2571 observations of 32 predictors and a target variable. Of 32 variables, 31 are numerical and 1 (Brand Code) is categorical.

There is a fraction of records (4/0.16%) that does not have corresponding target variable. As we’re unable to use those records for training purposes and as they represent such a small portion, we will remove them from data.

# removing observations wiht missing target variable
train <- train[!is.na(train$PH),]

Target variable (PH) values range from 7.88 to 9.36 and in general follow normal distribution.

ggplot(train,aes(x=PH)) + geom_histogram() + ggtitle('PH Distribution') + stat_bin(bins=15)

There are two variables that have more than 3% missing values:

subset(descr, na_perc >= 3, select=c(na_perc))

##             na_perc
## Brand Code*    4.67
## MFR            8.25

Brand is categorical, we will create a dummy variable representing each category as a numerical value so that we can include it in the modeling process.

# rename Brand Code and create dummies
names(train)[names(train) == "Brand Code"] <- "brand"
train <- dummy_cols(train)

brand_df <- subset(train,select=c(PH,brand_A,brand_B,brand_C,brand_D))
brand_long <- gather(brand_df, key = "variable", value = "value",-PH)
ggplot(brand_long, aes(x = value, y = PH)) +
  geom_point() + facet_grid(~ variable, scales = "free_x")

Brand in general doesn’t appear to be impacting the values of PH, however, it’s worth noting the the presence of brands A and D seems to narrow value range for PH, whereas brand C appears to lower value of PH.

It is unclear what variable MFR represents but some research suggests that MFR refers to membrane-integrated fermentation reactor (MFR) system that is used in fermentation by retaining cells and was found to reduce the amount of supplied sub-raw material that could lead to reduced costs in manufacturing process.

ggplot(train, aes(x = MFR, y = PH)) + geom_point()

## distribution
train%>%
  keep(is.numeric) %>% 
  gather() %>% 
  ggplot(aes(value)) +
    facet_wrap(~ key, scales = "free") +
    geom_density()

Looking at variables distributions and statistical summary, MFR is the most skewed with skeweness values of -5.2. Applying both log and spatial sign transformation don’t appear to work which could potentially inhibit including this variable in the modeling.

ggplot(train) + geom_density(aes(x=log(MFR)))

mfr_ss <- spatialSign(train$MFR)
ggplot(train) + geom_density(aes(x=mfr_ss))

ggplot(train, aes(x = log(MFR), y = PH)) + geom_point()

ggplot(train, aes(x = mfr_ss, y = PH)) + geom_point()

Let’s split the data into target and features and explore correlations between target and variables, as well as between variables.

# separate data into features and target
features <- as.data.frame(subset(train,select= -PH))
features <- subset(features,select = -c(brand,brand_NA))
PH <- as.data.frame(subset(train,select=PH))

# plot correlations
correlations <- cor(cbind(PH,features),use="pairwise.complete.obs")
corrplot::corrplot(correlations, type="lower", tl.cex = 0.5)

There are no strong correlations between PH and other variables, however, Mnf Flow exhibits strongest negative correlation and Bowl Setpoint has the strongest positive relationship .

rbind(subset(as.data.frame(correlations), PH >= 0.2, select=c(PH)),subset(as.data.frame(correlations), PH <= -0.3, select=c(PH)))

##                           PH
## PH                 1.0000000
## Filler Level       0.3232703
## Pressure Vacuum    0.2205372
## Bowl Setpoint      0.3491198
## Mnf Flow          -0.4469751
## Usage cont        -0.3184155
## Pressure Setpoint -0.3110191

ggplot(train, aes(x = train$`Mnf Flow`, y = PH)) + geom_point() + ggtitle('PH vs Mnf Flow')

ggplot(train, aes(x = train$`Bowl Setpoint`, y = PH)) + geom_point() + ggtitle('PH vs Bowl Setpoint')

ggplot(train, aes(x = train$`Filler Level`, y = PH)) + geom_point() + ggtitle('PH vs Filler Level')

There is a number of predictors that are highly correlated, such as Balling/Density, Hyd Pressure2/Hyd Pressure3, Bowl Setpoint/Filler Lvl, and Balling Lvl/Balling. We will remove those from data as part of data processing.

ggplot(train, aes(x = Balling, y = Density)) + geom_point() + ggtitle('Balling vs Density')

ggplot(train, aes(x = `Hyd Pressure2`, y = `Hyd Pressure3`)) + geom_point() + ggtitle('Hyd Pressure2 vs Hyd Pressure3')

Looking at the individual distributions, some variable require additional preprocessing to account for different scale, skeweness and outliers.

# Distribution of each variable
features %>%
  keep(is.numeric) %>% 
  gather() %>% 
  ggplot(aes(x=key, y=value)) +
  geom_boxplot() +
  stat_summary(fun.y = mean, color = "blue", geom = "point") +  
  stat_summary(fun.y = median, color = "red", geom = "point") +
  coord_flip() + 
  labs(title = "Train variables distribution", x = element_blank(), y = element_blank())

# Zoomed in distribution excluding bigger vars
features2 <- subset(features,select=-c(MFR,`Filler Speed`,`Carb Flow`))
features2 %>%
  keep(is.numeric) %>% 
  gather() %>% 
  ggplot(aes(x=key, y=value)) +
  geom_boxplot() +
  stat_summary(fun.y = mean, color = "blue", geom = "point") +  
  stat_summary(fun.y = median, color = "red", geom = "point") +
  coord_flip() + 
  labs(title = "Train variables distribution (excl. MFR, Filler Speed,Carb Flow)", x = element_blank(), y = element_blank())

Additional Preprocessing

## scale, center, impute NAs
prep <- preProcess(features, method=c('scale','center','knnImpute','BoxCox'))
prep_features <- predict(prep,features)


# split data into train and test
set.seed(1)
split <- createDataPartition(train$PH,p=0.75,list=FALSE)
x_train <- prep_features[split,]
y_train <- PH[split,]
x_test <- prep_features[-split,]
y_test <- PH[-split,]

# remove near zero variance features
pred_to_remove <- nearZeroVar(features)
x_train <- x_train[-pred_to_remove]
x_test <- x_test[-pred_to_remove]

# remove highly correlated features
corThresh <- 0.9
tooHigh <- findCorrelation(cor(x_train),corThresh)
x_train <- x_train[,-tooHigh]
x_test <- x_test[,-tooHigh]

set.seed(1)
ctrl <- trainControl(method='cv',number=10)

Features distribution after preprocessing shows that despite preprocessing there is high number of skewed features and outliers that can impact modeling process. Given that information, it is possible that models that are more flexible with outliers (such as trees) have a better chance at performing well over models that might be influenced by outliers (such as linear).

x_train %>%
  keep(is.numeric) %>% 
  gather() %>% 
  ggplot(aes(x=key, y=value)) +
  geom_boxplot(outlier.size = 1) +
  stat_summary(fun.y = mean, color = "blue", geom = "point") +  
  stat_summary(fun.y = median, color = "red", geom = "point") +
  coord_flip() + 
  labs(title = "", x = element_blank(), y = element_blank())

Models

Linear

Linear models are highly interpretable. Moreover, they enable us to compute standard errors of the coefficients that can be used to assess the statistical significance of each predictor, which can provide a better understanding of the relationships between variables. Despite all the advantages of linear models, nonlinear relationships between predictors and target may not be adequately captured.

Multivariate regression with all variables - base

The objective of ordinary least squares is to find the plane that minimizes SSE between observed and predicted response.

set.seed(1)
lm_all <- lm(y_train ~ ., data = x_train)
summary(lm_all)

## 
## Call:
## lm(formula = y_train ~ ., data = x_train)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.49814 -0.08236  0.00881  0.08958  0.57967 
## 
## Coefficients: (1 not defined because of singularities)
##                       Estimate Std. Error  t value Pr(>|t|)    
## (Intercept)          8.548e+00  3.172e-03 2694.856  < 2e-16 ***
## `Carb Volume`       -8.336e-03  8.375e-03   -0.995 0.319699    
## `Fill Ounces`       -3.831e-03  3.333e-03   -1.149 0.250574    
## `PC Volume`         -9.816e-06  3.720e-03   -0.003 0.997895    
## `Carb Pressure`     -6.144e-04  1.164e-02   -0.053 0.957903    
## `Carb Temp`          4.621e-03  1.060e-02    0.436 0.662805    
## PSC                 -7.824e-03  3.301e-03   -2.370 0.017895 *  
## `PSC Fill`          -6.977e-04  3.187e-03   -0.219 0.826737    
## `PSC CO2`           -5.532e-03  3.196e-03   -1.731 0.083634 .  
## `Mnf Flow`          -8.123e-02  6.453e-03  -12.588  < 2e-16 ***
## `Carb Pressure1`     2.743e-02  3.854e-03    7.118 1.54e-12 ***
## `Fill Pressure`      1.266e-02  4.428e-03    2.858 0.004309 ** 
## `Hyd Pressure2`      1.881e-02  4.906e-03    3.834 0.000130 ***
## `Hyd Pressure4`     -1.863e-03  4.849e-03   -0.384 0.700794    
## `Filler Level`       1.748e-02  4.681e-03    3.734 0.000194 ***
## `Filler Speed`      -3.512e-03  5.993e-03   -0.586 0.557995    
## Temperature         -2.006e-02  3.502e-03   -5.727 1.18e-08 ***
## `Usage cont`        -2.402e-02  3.978e-03   -6.038 1.87e-09 ***
## `Carb Flow`          9.598e-03  4.441e-03    2.161 0.030805 *  
## Density             -4.692e-02  7.742e-03   -6.061 1.63e-09 ***
## MFR                  2.732e-03  4.736e-03    0.577 0.564064    
## `Pressure Vacuum`   -5.207e-03  4.115e-03   -1.265 0.205984    
## `Oxygen Filler`     -1.431e-02  4.382e-03   -3.265 0.001116 ** 
## `Pressure Setpoint` -1.849e-02  4.560e-03   -4.055 5.22e-05 ***
## `Air Pressurer`     -1.041e-03  3.267e-03   -0.319 0.750129    
## `Alch Rel`           1.992e-02  1.138e-02    1.750 0.080234 .  
## `Carb Rel`           1.929e-02  6.752e-03    2.857 0.004324 ** 
## brand_A             -2.117e-02  5.598e-03   -3.781 0.000161 ***
## brand_B             -2.006e-02  1.421e-02   -1.412 0.158237    
## brand_C             -5.809e-02  9.712e-03   -5.981 2.64e-09 ***
## brand_D                     NA         NA       NA       NA    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1355 on 1897 degrees of freedom
## Multiple R-squared:  0.3936, Adjusted R-squared:  0.3844 
## F-statistic: 42.46 on 29 and 1897 DF,  p-value: < 2.2e-16

predictions <- predict(lm_all,x_test)
values <- data.frame(obs = y_test, pred = predictions)
defaultSummary(values)

##      RMSE  Rsquared       MAE 
## 0.1340482 0.3939160 0.1034682

lm_all_rmse_train <- sqrt(mean(lm_all$residuals^2))
lm_all_rsq_train <- summary(lm_all)$r.squared
lm_all_rmse_test <- defaultSummary(values)[1]
lm_all_rsq_test <- defaultSummary(values)[2]
lm_all_res <- c(lm_all_rmse_train,lm_all_rmse_test,lm_all_rsq_train,lm_all_rsq_test)

Backward stepwise elimination based on AIC

A number of variables do not carry any significance. We will perform stepwise elimiation on teh orignial linear model and used the results to feed into reduced linear model.

set.seed(1)
step <- stepAIC(lm_all, direction="backward",trace=0)
summary(step)

## 
## Call:
## lm(formula = y_train ~ `Carb Volume` + PSC + `PSC CO2` + `Mnf Flow` + 
##     `Carb Pressure1` + `Fill Pressure` + `Hyd Pressure2` + `Filler Level` + 
##     Temperature + `Usage cont` + `Carb Flow` + Density + `Oxygen Filler` + 
##     `Pressure Setpoint` + `Alch Rel` + `Carb Rel` + brand_A + 
##     brand_B + brand_C, data = x_train)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.49505 -0.08182  0.00981  0.09102  0.58006 
## 
## Coefficients:
##                      Estimate Std. Error  t value Pr(>|t|)    
## (Intercept)          8.547613   0.003109 2749.648  < 2e-16 ***
## `Carb Volume`       -0.011475   0.005845   -1.963  0.04977 *  
## PSC                 -0.008185   0.003132   -2.613  0.00905 ** 
## `PSC CO2`           -0.005759   0.003118   -1.847  0.06489 .  
## `Mnf Flow`          -0.079881   0.006322  -12.636  < 2e-16 ***
## `Carb Pressure1`     0.028033   0.003748    7.479 1.13e-13 ***
## `Fill Pressure`      0.012130   0.004292    2.826  0.00476 ** 
## `Hyd Pressure2`      0.021747   0.004390    4.954 7.92e-07 ***
## `Filler Level`       0.017726   0.004487    3.950 8.10e-05 ***
## Temperature         -0.019723   0.003355   -5.879 4.87e-09 ***
## `Usage cont`        -0.024568   0.003856   -6.371 2.35e-10 ***
## `Carb Flow`          0.008792   0.003986    2.206  0.02753 *  
## Density             -0.045037   0.007579   -5.942 3.33e-09 ***
## `Oxygen Filler`     -0.014012   0.004254   -3.294  0.00101 ** 
## `Pressure Setpoint` -0.018286   0.004497   -4.067 4.97e-05 ***
## `Alch Rel`           0.020103   0.011259    1.786  0.07434 .  
## `Carb Rel`           0.020212   0.006613    3.056  0.00227 ** 
## brand_A             -0.022603   0.005181   -4.363 1.35e-05 ***
## brand_B             -0.021928   0.014005   -1.566  0.11759    
## brand_C             -0.059147   0.009509   -6.220 6.09e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1353 on 1907 degrees of freedom
## Multiple R-squared:  0.3919, Adjusted R-squared:  0.3859 
## F-statistic: 64.69 on 19 and 1907 DF,  p-value: < 2.2e-16

Reduced linear

Reduced linear model include only most significant features from step model.

set.seed(1)
lm_reduced <- lm(y_train ~ PSC + `PSC CO2` + `Mnf Flow` + `Carb Pressure1` + 
    `Fill Pressure` + `Hyd Pressure2` + `Filler Level` + Temperature + 
    `Usage cont` + `Carb Flow` + Density + `Oxygen Filler` + 
    `Pressure Setpoint` + `Alch Rel` + `Carb Rel` + brand_A + 
    brand_C, data = x_train)
summary(lm_reduced)

## 
## Call:
## lm(formula = y_train ~ PSC + `PSC CO2` + `Mnf Flow` + `Carb Pressure1` + 
##     `Fill Pressure` + `Hyd Pressure2` + `Filler Level` + Temperature + 
##     `Usage cont` + `Carb Flow` + Density + `Oxygen Filler` + 
##     `Pressure Setpoint` + `Alch Rel` + `Carb Rel` + brand_A + 
##     brand_C, data = x_train)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.50035 -0.08140  0.00962  0.09045  0.59291 
## 
## Coefficients:
##                      Estimate Std. Error  t value Pr(>|t|)    
## (Intercept)          8.546991   0.003090 2766.451  < 2e-16 ***
## PSC                 -0.008533   0.003131   -2.725  0.00649 ** 
## `PSC CO2`           -0.005334   0.003114   -1.713  0.08690 .  
## `Mnf Flow`          -0.079954   0.006320  -12.650  < 2e-16 ***
## `Carb Pressure1`     0.028355   0.003746    7.570 5.76e-14 ***
## `Fill Pressure`      0.012024   0.004293    2.801  0.00514 ** 
## `Hyd Pressure2`      0.022486   0.004355    5.163 2.68e-07 ***
## `Filler Level`       0.018860   0.004457    4.231 2.43e-05 ***
## Temperature         -0.019369   0.003343   -5.793 8.06e-09 ***
## `Usage cont`        -0.025132   0.003848   -6.531 8.35e-11 ***
## `Carb Flow`          0.009219   0.003964    2.326  0.02014 *  
## Density             -0.043646   0.007297   -5.981 2.63e-09 ***
## `Oxygen Filler`     -0.013773   0.004255   -3.237  0.00123 ** 
## `Pressure Setpoint` -0.018904   0.004486   -4.214 2.62e-05 ***
## `Alch Rel`           0.031181   0.007478    4.170 3.19e-05 ***
## `Carb Rel`           0.016204   0.006069    2.670  0.00765 ** 
## brand_A             -0.016479   0.003624   -4.547 5.77e-06 ***
## brand_C             -0.044862   0.003339  -13.435  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1354 on 1909 degrees of freedom
## Multiple R-squared:  0.3904, Adjusted R-squared:  0.3849 
## F-statistic:  71.9 on 17 and 1909 DF,  p-value: < 2.2e-16

predictions <- predict(lm_reduced,x_test)
values <- data.frame(obs = y_test, pred = predictions)
defaultSummary(values)

##      RMSE  Rsquared       MAE 
## 0.1343156 0.3914480 0.1035036

lm_red_rmse_train <- sqrt(mean(lm_reduced$residuals^2))
lm_red_rsq_train <- summary(lm_reduced)$r.squared
lm_red_rmse_test <- defaultSummary(values)[1]
lm_red_rsq_test <- defaultSummary(values)[2]
lm_red_res <- c(lm_red_rmse_train,lm_red_rmse_test,lm_red_rsq_train,lm_red_rsq_test)

PLS

Our data contains correlated variable which typically could cause the model to become unstable. Although we have removed those highly correlated variables during preprocessing, this process does not ensure that linear combinations of predictors are uncorrelated with other predictors. One of the solutions to deal with this is implementing PLS which finds linear combinations of the predictors (called components of latent variables). PLS components are chosen to maximally summarize covariance with the response, meaning that, unlike PCA, PLS requires the components to have correlations with the response as well.

set.seed(1)
pls_model <- train(x=x_train,
                   y=y_train, 
                   method='pls', 
                   trControl=ctrl, 
                   tuneLength = 10) 
pls_model

## Partial Least Squares 
## 
## 1927 samples
##   30 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 1734, 1734, 1734, 1735, 1734, 1734, ... 
## Resampling results across tuning parameters:
## 
##   ncomp  RMSE       Rsquared   MAE      
##    1     0.1489636  0.2568308  0.1176501
##    2     0.1426785  0.3206904  0.1118853
##    3     0.1403072  0.3448870  0.1106603
##    4     0.1385840  0.3604581  0.1091929
##    5     0.1381025  0.3645576  0.1082952
##    6     0.1374136  0.3704032  0.1076334
##    7     0.1372897  0.3719071  0.1075234
##    8     0.1372293  0.3727878  0.1072861
##    9     0.1371417  0.3736025  0.1071334
##   10     0.1371176  0.3738746  0.1070728
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was ncomp = 10.

plot(pls_model,metric="RMSE")

set.seed(1)
predictions <- predict(pls_model,x_test)
values <- data.frame(obs = y_test, pred = predictions)
defaultSummary(values)

##      RMSE  Rsquared       MAE 
## 0.1340967 0.3935254 0.1034844

pls_rmse_train <- max(pls_model$results$RMSE)
pls_rsq_train <- max(pls_model$results$Rsquared)
pls_rmse_test <- defaultSummary(values)[1]
pls_rsq_test <- defaultSummary(values)[2]
pls_res <- c(pls_rmse_train,pls_rmse_test,pls_rsq_train,pls_rsq_test)

Non-linear

MARS

MARS uses surrogate features instead of original predictors, but unlike PLS, those features are less complex and only consist of one or two predictors at a time. MARS can be viewed as an extension of linear model and often is referred to as piecewise linear model due to it adding new features to basic linear regression.

set.seed(1)
marsModel <- train(x = x_train,
                  y = y_train,
                  method = "earth",
                  preProc = c('center','scale'),
                  tuneLength = 10)
marsModel

## Multivariate Adaptive Regression Spline 
## 
## 1927 samples
##   30 predictor
## 
## Pre-processing: centered (30), scaled (30) 
## Resampling: Bootstrapped (25 reps) 
## Summary of sample sizes: 1927, 1927, 1927, 1927, 1927, 1927, ... 
## Resampling results across tuning parameters:
## 
##   nprune  RMSE       Rsquared   MAE      
##    2      0.1542519  0.2145446  0.1207680
##    3      0.1456191  0.3003308  0.1143758
##    5      0.1434971  0.3207035  0.1125989
##    7      0.1403150  0.3509996  0.1091066
##    8      0.1391797  0.3617256  0.1079560
##   10      0.1370699  0.3811620  0.1056698
##   12      0.1415932  0.3777469  0.1052247
##   13      0.1424779  0.3743081  0.1051814
##   15      0.1430844  0.3745112  0.1049641
##   17      0.1435887  0.3713305  0.1049046
## 
## Tuning parameter 'degree' was held constant at a value of 1
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were nprune = 10 and degree = 1.

set.seed(1)
marsPred <- predict(marsModel, x_test)
## The function 'postResample' can be used to get the test set performance values
postResample(pred = marsPred, obs = y_test)

##      RMSE  Rsquared       MAE 
## 0.1316172 0.4158437 0.1008916

plot(marsModel)

varImp(marsModel)

## earth variable importance
## 
##   only 20 most important variables shown (out of 36)
## 
##                  Overall
## `MnfFlow`         100.00
## brand_C            68.70
## `PressureVacuum`   51.00
## `AlchRel`          45.61
## Density            41.49
## `Usagecont`        35.90
## Temperature        30.71
## `CarbPressure1`    23.45
## `OxygenFiller`     14.12
## brand_D             0.00
## CarbPressure1       0.00
## CarbRel             0.00
## HydPressure2        0.00
## CarbTemp            0.00
## FillPressure        0.00
## AirPressurer        0.00
## CarbVolume          0.00
## brand_A             0.00
## FillOunces          0.00
## PressureVacuum      0.00

mars_rmse_train <- max(marsModel$results$RMSE)
mars_rsq_train <- max(marsModel$results$Rsquared)
mars_rmse_test <- postResample(pred = marsPred, obs = y_test)[1]
mars_rsq_test <- postResample(pred = marsPred, obs = y_test)[2]
mars_res <- c(mars_rmse_train,mars_rmse_test,mars_rsq_train,mars_rsq_test)

SVM

Linear models seek to minimize SSE which can be influenced by outliers. SVM uses different function (similar to Huber using squared residuals when they’re small vs absolute residuals when they’re large) that helps elimiate outlier influence on overall model. As seen in the train data, there are numerous outliers that can impact our linear models, we will try SVM to limit the impact of residuals on the model.

set.seed(1)
svmModel <- train(x = x_train,
                  y = y_train,
                  method = "svmRadial",
                  preProc = c('center','scale'),
                  tuneLength = 10)
svmModel

## Support Vector Machines with Radial Basis Function Kernel 
## 
## 1927 samples
##   30 predictor
## 
## Pre-processing: centered (30), scaled (30) 
## Resampling: Bootstrapped (25 reps) 
## Summary of sample sizes: 1927, 1927, 1927, 1927, 1927, 1927, ... 
## Resampling results across tuning parameters:
## 
##   C       RMSE       Rsquared   MAE       
##     0.25  0.1282547  0.4645443  0.09607492
##     0.50  0.1257101  0.4823713  0.09342793
##     1.00  0.1242103  0.4926859  0.09202863
##     2.00  0.1234079  0.4988076  0.09177345
##     4.00  0.1237731  0.4985127  0.09232952
##     8.00  0.1257294  0.4893784  0.09400604
##    16.00  0.1294083  0.4715981  0.09704637
##    32.00  0.1341714  0.4498762  0.10100273
##    64.00  0.1396351  0.4262176  0.10533740
##   128.00  0.1451957  0.4047201  0.10982657
## 
## Tuning parameter 'sigma' was held constant at a value of 0.02144906
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were sigma = 0.02144906 and C = 2.

set.seed(1)
svmPred <- predict(svmModel, x_test)
## The function 'postResample' can be used to get the test set performance values
postResample(pred = svmPred, obs = y_test)

##       RMSE   Rsquared        MAE 
## 0.11996505 0.52254954 0.08585762

plot(svmModel)

plot(varImp(svmModel, scale = FALSE), top=10,scales = list(y = list(cex = 0.8)))

svm_rmse_train <- max(svmModel$results$RMSE)
svm_rsq_train <- max(svmModel$results$Rsquared)
svm_rmse_test <- postResample(pred = svmPred, obs = y_test)[1]
svm_rsq_test <- postResample(pred = svmPred, obs = y_test)[2]
svm_res <- c(svm_rmse_train,svm_rmse_test,svm_rsq_train,svm_rsq_test)

Decision trees and rules

Tree-based models consists of set of if-then rules on predictors that partition the data. There are many benefits to tree-based modeling, including high interpretability and ability to work with different type of data including outliers. Trees are also effective in selecting important variables, whereas linear models require to specify the relationships for the model.

Single tree

Basic trees partition the data into small groups that are homogenous to the response. Classification and Regression Tree (CART) methodology does this by going through each predictor and finding a value that would split the data into 2 groups such that overall sums of squared errors in each group are minimized.

set.seed(1)
cart_model <- train(x=x_train,y=y_train,
                   method='rpart2', 
                   metric='RMSE',
                   tuneLength=20,
                   trControl = ctrl)
cart_model

## CART 
## 
## 1927 samples
##   30 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 1734, 1734, 1734, 1735, 1734, 1734, ... 
## Resampling results across tuning parameters:
## 
##   maxdepth  RMSE       Rsquared   MAE      
##    1        0.1526709  0.2204539  0.1196970
##    2        0.1453394  0.2925780  0.1145968
##    3        0.1416887  0.3283476  0.1126058
##    4        0.1383008  0.3601729  0.1088833
##    5        0.1360043  0.3805625  0.1075105
##    6        0.1348800  0.3912799  0.1057709
##    7        0.1332003  0.4067103  0.1040133
##    8        0.1328575  0.4088968  0.1031772
##    9        0.1325018  0.4136135  0.1026534
##   11        0.1319700  0.4190934  0.1024201
##   12        0.1316252  0.4214934  0.1018549
##   13        0.1314065  0.4237495  0.1017249
##   14        0.1315683  0.4225134  0.1018159
##   15        0.1315683  0.4225134  0.1018159
##   16        0.1315683  0.4225134  0.1018159
##   17        0.1315683  0.4225134  0.1018159
##   18        0.1315683  0.4225134  0.1018159
##   19        0.1315683  0.4225134  0.1018159
##   20        0.1315683  0.4225134  0.1018159
##   22        0.1315683  0.4225134  0.1018159
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was maxdepth = 13.

plot(cart_model)

set.seed(1)
predictions <- predict(cart_model,x_test)
postResample(obs = y_test,pred = predictions)

##      RMSE  Rsquared       MAE 
## 0.1330452 0.4057827 0.1015993

plot(as.party(cart_model$finalModel), gp=gpar(fontsize = 8))

cart_rmse_train <- max(cart_model$results$RMSE)
cart_rsq_train <- max(cart_model$results$Rsquared)
cart_rmse_test <- postResample(pred = predictions, obs = y_test)[1]
cart_rsq_test <- postResample(pred = predictions, obs = y_test)[2]
cart_res <- c(cart_rmse_train,cart_rmse_test,cart_rsq_train,cart_rsq_test)

Random Forest

One of the disatvantages of a single tree is that the model can become unstable if data are slightly altered. To combat the issue, bagging methods were introduced that assembled numerous unstable models and averaged predictions thus minimizing variance of predictions. In the case of trees, however, bagging methods produce dependent trees and that prevents bagging from optimally reducing variance of the predicted values. Random forest is a more effective method to ensemble trees as it incorporates randomness into the modeling process.

set.seed(1)
rf_grid<- expand.grid(mtry=c(1,3,5,7,10))
rf_model <- train(x=x_train,y=y_train,
                   method='rf', 
                   metric='RMSE',
                   importance=TRUE,
                  tuneGrid=rf_grid,
                   trControl = ctrl)
rf_model

## Random Forest 
## 
## 1927 samples
##   30 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 1734, 1734, 1734, 1735, 1734, 1734, ... 
## Resampling results across tuning parameters:
## 
##   mtry  RMSE       Rsquared   MAE       
##    1    0.1273303  0.5300828  0.09974982
##    3    0.1112161  0.6217576  0.08437119
##    5    0.1072205  0.6418152  0.08034513
##    7    0.1054417  0.6485870  0.07852594
##   10    0.1038344  0.6555277  0.07674446
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was mtry = 10.

plot(rf_model)

set.seed(1)
predictions <- predict(rf_model,x_test)
postResample(obs = y_test,pred = predictions)

##       RMSE   Rsquared        MAE 
## 0.10264266 0.65386102 0.07367475

plot(varImp(rf_model, scale = FALSE), top=10,scales = list(y = list(cex = 0.8)))

rf_rmse_train <- max(rf_model$results$RMSE)
rf_rsq_train <- max(rf_model$results$Rsquared)
rf_rmse_test <- postResample(pred = predictions, obs = y_test)[1]
rf_rsq_test <- postResample(pred = predictions, obs = y_test)[2]
rf_res <- c(rf_rmse_train,rf_rmse_test,rf_rsq_train,rf_rsq_test)

Boosting

Boosting method combines week learners to produce an ensemble with a superior generalized missclassification error rate. Boosting is similar to random forests in that it combines multiple models where trees are used as base learners, however, whereas random forests produce independent trees, trees in boosting are dependent on past trees and contribute unequally to the final model.

set.seed(1)
gbmGrid <- expand.grid(interaction.depth = seq(1,5,by=1),
                       n.trees=c(10,20,50,100),
                       shrinkage=c(0.01,0.1,0.5),
                       n.minobsinnode=10)
gbm_model <- train(x=x_train,y=y_train,
                   method='gbm', 
                   metric='RMSE',
                   verbose=FALSE,
                   tuneGrid=gbmGrid,
                   trControl = ctrl)
gbm_model

## Stochastic Gradient Boosting 
## 
## 1927 samples
##   30 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 1734, 1734, 1734, 1735, 1734, 1734, ... 
## Resampling results across tuning parameters:
## 
##   shrinkage  interaction.depth  n.trees  RMSE       Rsquared   MAE       
##   0.01       1                   10      0.1688129  0.2217692  0.13518121
##   0.01       1                   20      0.1660374  0.2262628  0.13272193
##   0.01       1                   50      0.1599234  0.2524418  0.12735523
##   0.01       1                  100      0.1532405  0.3051912  0.12166258
##   0.01       2                   10      0.1676254  0.2923256  0.13430472
##   0.01       2                   20      0.1638411  0.3043571  0.13121291
##   0.01       2                   50      0.1554456  0.3312181  0.12407551
##   0.01       2                  100      0.1467188  0.3629230  0.11665697
##   0.01       3                   10      0.1670259  0.3423768  0.13386526
##   0.01       3                   20      0.1626312  0.3499364  0.13025546
##   0.01       3                   50      0.1529004  0.3742273  0.12209891
##   0.01       3                  100      0.1430715  0.4033872  0.11374033
##   0.01       4                   10      0.1665633  0.3689528  0.13346675
##   0.01       4                   20      0.1617850  0.3714391  0.12953090
##   0.01       4                   50      0.1511931  0.3962902  0.12062855
##   0.01       4                  100      0.1406744  0.4279384  0.11179342
##   0.01       5                   10      0.1661956  0.3935395  0.13321408
##   0.01       5                   20      0.1612075  0.3969983  0.12907684
##   0.01       5                   50      0.1499553  0.4192725  0.11957449
##   0.01       5                  100      0.1389537  0.4471722  0.11026507
##   0.10       1                   10      0.1528095  0.3040002  0.12127403
##   0.10       1                   20      0.1449892  0.3436923  0.11459679
##   0.10       1                   50      0.1370396  0.3878114  0.10815944
##   0.10       1                  100      0.1334086  0.4138890  0.10499613
##   0.10       2                   10      0.1467252  0.3557075  0.11667273
##   0.10       2                   20      0.1375850  0.3991072  0.10910036
##   0.10       2                   50      0.1295837  0.4520172  0.10137490
##   0.10       2                  100      0.1253953  0.4807857  0.09699295
##   0.10       3                   10      0.1427417  0.3950982  0.11340360
##   0.10       3                   20      0.1335577  0.4363662  0.10553633
##   0.10       3                   50      0.1258696  0.4816762  0.09789031
##   0.10       3                  100      0.1213753  0.5123656  0.09334885
##   0.10       4                   10      0.1402517  0.4200423  0.11127212
##   0.10       4                   20      0.1313326  0.4524631  0.10344064
##   0.10       4                   50      0.1235531  0.4998932  0.09543734
##   0.10       4                  100      0.1188944  0.5300856  0.09103839
##   0.10       5                   10      0.1389605  0.4370234  0.11031496
##   0.10       5                   20      0.1291881  0.4724941  0.10135771
##   0.10       5                   50      0.1205611  0.5221670  0.09331696
##   0.10       5                  100      0.1164622  0.5487392  0.08883952
##   0.50       1                   10      0.1360153  0.3879056  0.10728819
##   0.50       1                   20      0.1337980  0.4078771  0.10451036
##   0.50       1                   50      0.1319037  0.4187612  0.10165242
##   0.50       1                  100      0.1322606  0.4179017  0.10147238
##   0.50       2                   10      0.1329346  0.4132443  0.10529131
##   0.50       2                   20      0.1297663  0.4368818  0.10084259
##   0.50       2                   50      0.1283066  0.4534849  0.09792825
##   0.50       2                  100      0.1299330  0.4466890  0.09942877
##   0.50       3                   10      0.1268463  0.4628120  0.09823843
##   0.50       3                   20      0.1258635  0.4698522  0.09622082
##   0.50       3                   50      0.1258605  0.4772883  0.09525533
##   0.50       3                  100      0.1280254  0.4736549  0.09725696
##   0.50       4                   10      0.1293416  0.4413009  0.09890308
##   0.50       4                   20      0.1287748  0.4480072  0.09683568
##   0.50       4                   50      0.1295969  0.4524839  0.09670614
##   0.50       4                  100      0.1343096  0.4359362  0.09959758
##   0.50       5                   10      0.1238703  0.4881776  0.09477197
##   0.50       5                   20      0.1232184  0.4929108  0.09337727
##   0.50       5                   50      0.1277243  0.4731577  0.09673335
##   0.50       5                  100      0.1327078  0.4562536  0.10148788
## 
## Tuning parameter 'n.minobsinnode' was held constant at a value of 10
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were n.trees = 100, interaction.depth =
##  5, shrinkage = 0.1 and n.minobsinnode = 10.

plot(gbm_model)

set.seed(1)
predictions <- predict(gbm_model,x_test)
postResample(obs = y_test,pred = predictions)

##       RMSE   Rsquared        MAE 
## 0.11620098 0.54659749 0.08718576

plot(varImp(gbm_model, scale = FALSE), top=10,scales = list(y = list(cex = 0.8)))

gbm_rmse_train <- max(gbm_model$results$RMSE)
gbm_rsq_train <- max(gbm_model$results$Rsquared)
gbm_rmse_test <- postResample(pred = predictions, obs = y_test)[1]
gbm_rsq_test <- postResample(pred = predictions, obs = y_test)[2]
gbm_res <- c(gbm_rmse_train,gbm_rmse_test,gbm_rsq_train,gbm_rsq_test)

Cubist

Cubist is a rule-based model which is also similar to trees but unlike trees, it combines models proportionally into final model based individual RMSE, i.e., models with lower RMSE will be assigned higher weights than worse performing models.

set.seed(1)
cubGrid <- expand.grid(committees=c(1,3,5,7,10),
                       neighbors=c(1,3,5))
cub_model <- train(x=x_train,y=y_train,
                   method='cubist', 
                   metric='RMSE',
                   tuneGrid=cubGrid,
                   trControl = ctrl,
                   verbose=FALSE)
cub_model

## Cubist 
## 
## 1927 samples
##   30 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 1734, 1734, 1734, 1735, 1734, 1734, ... 
## Resampling results across tuning parameters:
## 
##   committees  neighbors  RMSE       Rsquared   MAE       
##    1          1          0.1422817  0.4856878  0.09186507
##    1          3          0.1319490  0.5072097  0.08471470
##    1          5          0.1311295  0.5035950  0.08454833
##    3          1          0.1228620  0.5548586  0.08467224
##    3          3          0.1116468  0.5981456  0.07818007
##    3          5          0.1112503  0.5964564  0.07818153
##    5          1          0.1178537  0.5756296  0.08234199
##    5          3          0.1063171  0.6262765  0.07581338
##    5          5          0.1059999  0.6257008  0.07599157
##    7          1          0.1162060  0.5850890  0.08128862
##    7          3          0.1047045  0.6360174  0.07463855
##    7          5          0.1042771  0.6367265  0.07472448
##   10          1          0.1152667  0.5891866  0.08039447
##   10          3          0.1034833  0.6432856  0.07380289
##   10          5          0.1030296  0.6448527  0.07408879
## 
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were committees = 10 and neighbors = 5.

plot(cub_model)

set.seed(1)
predictions <- predict(cub_model,x_test)
postResample(obs = y_test,pred = predictions)

##       RMSE   Rsquared        MAE 
## 0.10620484 0.62641492 0.07473372

plot(varImp(rf_model, scale = FALSE), top=10,scales = list(y = list(cex = 0.8)))

cub_rmse_train <- max(cub_model$results$RMSE)
cub_rsq_train <- max(cub_model$results$Rsquared)
cub_rmse_test <- postResample(pred = predictions, obs = y_test)[1]
cub_rsq_test <- postResample(pred = predictions, obs = y_test)[2]
cub_res <- c(cub_rmse_train,cub_rmse_test,cub_rsq_train,cub_rsq_test)

Model Comparison

The below table summarized models by RMSE and R squared on train and test data. Random forest is the best performing followed by Cubist. Linear models performed the worst, which indicates that variables are not linearly dependent, another possible reason for that is the high number of outliers that skewed linear predictions. Although Cubist and Random Forest achieve very similar metrics, random forest is preferable due to its high interpretability. As it achieves lowest RMSE and highest variance explanation, this model is the final choice.

res <- data.frame(rbind(lm_all_res,lm_red_res,pls_res,mars_res,svm_res,cart_res,rf_res,gbm_res,cub_res))
colnames(res) <- c("RMSE_train","RMSE_test",'Rsq_train','Rsq_test')
rownames(res) <- c('Linear','Reduced Linear','PLS','MARS','SVM','Single Tree','Random Forest','Boosting','Cubist')
datatable(res) %>% formatRound(c("RMSE_train","RMSE_test",'Rsq_train','Rsq_test'),3)

Predictions on new data

evaluation <- read_excel('StudentEvaluation.xlsx')

names(evaluation)[names(evaluation) == "Brand Code"] <- "brand"
evaluation <- dummy_cols(evaluation)
evaluation <- subset(evaluation,select = -c(brand,brand_NA,PH))
str(evaluation)

## tibble [267 x 35] (S3: tbl_df/tbl/data.frame)
##  $ Carb Volume      : num [1:267] 5.48 5.39 5.29 5.27 5.41 ...
##  $ Fill Ounces      : num [1:267] 24 24 23.9 23.9 24.2 ...
##  $ PC Volume        : num [1:267] 0.27 0.227 0.303 0.186 0.16 ...
##  $ Carb Pressure    : num [1:267] 65.4 63.2 66.4 64.8 69.4 73.4 65.2 67.4 66.8 72.6 ...
##  $ Carb Temp        : num [1:267] 135 135 140 139 142 ...
##  $ PSC              : num [1:267] 0.236 0.042 0.068 0.004 0.04 0.078 0.088 0.076 0.246 0.146 ...
##  $ PSC Fill         : num [1:267] 0.4 0.22 0.1 0.2 0.3 ...
##  $ PSC CO2          : num [1:267] 0.04 0.08 0.02 0.02 0.06 ...
##  $ Mnf Flow         : num [1:267] -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 ...
##  $ Carb Pressure1   : num [1:267] 117 119 120 125 115 ...
##  $ Fill Pressure    : num [1:267] 46 46.2 45.8 40 51.4 46.4 46.2 40 43.8 40.8 ...
##  $ Hyd Pressure1    : num [1:267] 0 0 0 0 0 0 0 0 0 0 ...
##  $ Hyd Pressure2    : num [1:267] NA 0 0 0 0 0 0 0 0 0 ...
##  $ Hyd Pressure3    : num [1:267] NA 0 0 0 0 0 0 0 0 0 ...
##  $ Hyd Pressure4    : num [1:267] 96 112 98 132 94 94 108 108 110 106 ...
##  $ Filler Level     : num [1:267] 129 120 119 120 116 ...
##  $ Filler Speed     : num [1:267] 3986 4012 4010 NA 4018 ...
##  $ Temperature      : num [1:267] 66 65.6 65.6 74.4 66.4 66.6 66.8 NA 65.8 66 ...
##  $ Usage cont       : num [1:267] 21.7 17.6 24.2 18.1 21.3 ...
##  $ Carb Flow        : num [1:267] 2950 2916 3056 28 3214 ...
##  $ Density          : num [1:267] 0.88 1.5 0.9 0.74 0.88 0.84 1.48 1.6 1.52 1.48 ...
##  $ MFR              : num [1:267] 728 736 735 NA 752 ...
##  $ Balling          : num [1:267] 1.4 2.94 1.45 1.06 1.4 ...
##  $ Pressure Vacuum  : num [1:267] -3.8 -4.4 -4.2 -4 -4 -3.8 -4.2 -4.4 -4.4 -4.2 ...
##  $ Oxygen Filler    : num [1:267] 0.022 0.03 0.046 NA 0.082 0.064 0.042 0.096 0.046 0.096 ...
##  $ Bowl Setpoint    : num [1:267] 130 120 120 120 120 120 120 120 120 120 ...
##  $ Pressure Setpoint: num [1:267] 45.2 46 46 46 50 46 46 46 46 46 ...
##  $ Air Pressurer    : num [1:267] 143 147 147 146 146 ...
##  $ Alch Rel         : num [1:267] 6.56 7.14 6.52 6.48 6.5 6.5 7.18 7.16 7.14 7.78 ...
##  $ Carb Rel         : num [1:267] 5.34 5.58 5.34 5.5 5.38 5.42 5.46 5.42 5.44 5.52 ...
##  $ Balling Lvl      : num [1:267] 1.48 3.04 1.46 1.48 1.46 1.44 3.02 3 3.1 3.12 ...
##  $ brand_A          : int [1:267] 0 1 0 0 0 0 1 0 1 0 ...
##  $ brand_B          : int [1:267] 0 0 1 1 1 1 0 1 0 0 ...
##  $ brand_C          : int [1:267] 0 0 0 0 0 0 0 0 0 0 ...
##  $ brand_D          : int [1:267] 1 0 0 0 0 0 0 0 0 1 ...
##  - attr(*, ".internal.selfref")=<externalptr>

evaluation[32:35] <- lapply(evaluation[32:35], as.numeric)
prep_new_data <- preProcess(evaluation, method=c('scale','center','knnImpute'))
new_data <- predict(prep_new_data,evaluation)

## Error: `i` must have one dimension, not 2.

new_data <- new_data[,-pred_to_remove]

## Error in eval(expr, envir, enclos): object 'new_data' not found

new_data <- new_data[,-tooHigh]

## Error in eval(expr, envir, enclos): object 'new_data' not found

final_predictions <- predict(rf_model,new_data)

## Error in predict.train(rf_model, new_data): object 'new_data' not found

write.csv(final_predictions,'624_final_predictions.csv')

## Error in is.data.frame(x): object 'final_predictions' not found

624_Final_Project

Albina Gallyavova, Michael Gankhuyag, Joby John

5/10/2020