rm(list = ls())      # Clear all objects from the environment
gc()                 # Free unused memory
##          used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
## Ncells 526535 28.2    1169675 62.5         NA   669420 35.8
## Vcells 971307  7.5    8388608 64.0      16384  1851915 14.2
         cat("\f")       # Clear the console
 graphics.off()      # Clear all graphs
head(mtcars)
##                    mpg cyl disp  hp drat    wt  qsec vs am gear carb
## Mazda RX4         21.0   6  160 110 3.90 2.620 16.46  0  1    4    4
## Mazda RX4 Wag     21.0   6  160 110 3.90 2.875 17.02  0  1    4    4
## Datsun 710        22.8   4  108  93 3.85 2.320 18.61  1  1    4    1
## Hornet 4 Drive    21.4   6  258 110 3.08 3.215 19.44  1  0    3    1
## Hornet Sportabout 18.7   8  360 175 3.15 3.440 17.02  0  0    3    2
## Valiant           18.1   6  225 105 2.76 3.460 20.22  1  0    3    1

I. Logistic Model

# Load the mtcars data set for analysis
data(mtcars)

# Create a binary variable for high/low mpg based on the average value
mtcars$mpg_level <- ifelse(mtcars$mpg > mean(mtcars$mpg), "high", "low")

mtcars$mpg_level <- factor(mtcars$mpg_level, 
                           levels = c("low", "high"))
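As a quick sanity check (my addition, not part of the original output), we can confirm how the cars split across the two classes:

# Count the cars in each mpg class
table(mtcars$mpg_level)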

# logistic regression model
logi_model <- glm(mpg_level ~ cyl + hp,  # Using # of cylinders & Gross Horsepower
                   data = mtcars, 
                   family = "binomial")
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
# Display summary of the model
summary(logi_model)
## 
## Call:
## glm(formula = mpg_level ~ cyl + hp, family = "binomial", data = mtcars)
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)
## (Intercept)    76.8530 20252.2940   0.004    0.997
## cyl            -9.5655  3375.3813  -0.003    0.998
## hp             -0.1721     0.1541  -1.117    0.264
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 43.8601  on 31  degrees of freedom
## Residual deviance:  6.6694  on 29  degrees of freedom
## AIC: 12.669
## 
## Number of Fisher Scoring iterations: 21
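The warning that fitted probabilities of 0 or 1 occurred, together with the enormous standard errors on the intercept and cyl, points to (quasi-)separation in the data. A quick cross-tab (my addition, not in the original output) shows how strongly cyl alone splits the two classes:

# Cross-tabulate cylinders against the mpg classes to see the near-separation
table(mtcars$cyl, mtcars$mpg_level)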
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
# Generate predicted probabilities on the training data
predictions <- predict(logi_model, 
                       newdata = mtcars, 
                       type = "response")

# Convert predicted probabilities to class labels ("high"/"low") using a 0.5 threshold
predicted_classes <- ifelse(predictions >= 0.5, 
                            yes = "high", 
                            no = "low")

# Set factor levels for the predicted classes to match the actual classes
predicted_classes <- factor(predicted_classes, 
                            levels = levels(mtcars$mpg_level))

# Add the predicted classes to the data set
mtcars$predicted_classes <- predicted_classes

# Create the confusion matrix
conf_matrix <- confusionMatrix(predicted_classes, 
                               mtcars$mpg_level)

# Display the matrix
conf_matrix
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction low high
##       low   17    0
##       high   1   14
##                                           
##                Accuracy : 0.9688          
##                  95% CI : (0.8378, 0.9992)
##     No Information Rate : 0.5625          
##     P-Value [Acc > NIR] : 2.612e-07       
##                                           
##                   Kappa : 0.937           
##                                           
##  Mcnemar's Test P-Value : 1               
##                                           
##             Sensitivity : 0.9444          
##             Specificity : 1.0000          
##          Pos Pred Value : 1.0000          
##          Neg Pred Value : 0.9333          
##              Prevalence : 0.5625          
##          Detection Rate : 0.5312          
##    Detection Prevalence : 0.5312          
##       Balanced Accuracy : 0.9722          
##                                           
##        'Positive' Class : low             
## 
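As an extra check (not in the original), the reported accuracy can be verified by hand:

# Proportion of cars whose predicted class matches the actual class
mean(mtcars$predicted_classes == mtcars$mpg_level)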

Interpretation

Our model shows high accuracy (0.97) along with strong sensitivity (0.94) and specificity (1.00).

The coefficients are interpreted below:

  • Intercept: Represents the log-odds of the ‘high’ mpg category when the predictor variables are 0.

  • Cylinders (cyl): For every one-unit increase in the number of cylinders, the log-odds of a car being in the “high” mpg category decrease by 9.57, holding horsepower constant.

  • Horsepower (hp): For every one-unit increase in horsepower, the log-odds of a car being in the “high” mpg category decrease by 0.17, holding the number of cylinders constant.

Both of these variables decrease the likelihood that a car falls into the “high” mpg category. Given the separation warning noted above, however, the intercept and cyl estimates are unstable and should be interpreted with caution.
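Because log-odds are hard to read directly, one optional follow-up (my addition, not in the original) is to exponentiate the coefficients into odds ratios:

# Odds ratios; values below 1 mean the predictor lowers the odds of "high" mpg
exp(coef(logi_model))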

Why running a multivariate regression is not a good idea

In a situation like this, where there are only two categorical outcomes (low and high), it is best to run a logistic model instead of a multivariate regression. Multivariate regression is best equipped for continuous outcomes that exhibit linear relationships with the predictors. Categorical outcomes often have non-linear relationships and can push a linear model's predictions outside the 0-1 probability range, making those predictions inappropriate, as the sketch below illustrates.
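As a sketch of the problem (this comparison is mine, not part of the original analysis; lpm is just an illustrative name), a linear model fit to the 0/1 version of the same outcome can produce fitted "probabilities" that stray outside the valid range:

# Fit a linear probability model to the 0/1 version of the outcome
lpm <- lm(as.numeric(mpg_level == "high") ~ cyl + hp, 
          data = mtcars)

# Check whether the fitted values fall outside the [0, 1] probability range
range(fitted(lpm))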

Box-Cox

# Multivariate regression model on the airquality data
multivariate_model <- lm(Ozone ~ Solar.R + Wind, 
                         data = airquality)

par(mfrow = c(2, 2))
plot(multivariate_model)

library(car)
## Loading required package: carData
library(MASS)

# Use boxcox() from MASS to estimate the Box-Cox transformation
boxcox_result <- boxcox(multivariate_model, 
                        lambda = seq(-3, 3, by = 0.1))  # fine grid for a smooth profile
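The peak can also be pulled out of the boxcox object rather than read off the graph (an extra step I added; lambda_hat is an illustrative name):

# Extract the lambda that maximizes the profile log-likelihood
lambda_hat <- boxcox_result$x[which.max(boxcox_result$y)]
lambda_hat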

As we can see, the confidence interval for lambda lies near 0, so a logarithmic transformation might be suitable; however, the profile log-likelihood peaks around 0.3, so I added that power to the model below. The refitted diagnostics show small improvements across all four plots.

# Refit the model with the Ozone^0.3 transformation (lambda of about 0.3 from the graph)
boxcox_model <- lm(Ozone^0.3 ~ Solar.R + Wind, 
                   data = airquality)

par(mfrow = c(2, 2))
plot(boxcox_model)
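Because the interval also sits near 0, a log-transformed model is worth comparing as well (this comparison is my addition; log_model is an illustrative name):

# The log transformation corresponds to lambda = 0 in the Box-Cox family
log_model <- lm(log(Ozone) ~ Solar.R + Wind, 
                data = airquality)

par(mfrow = c(2, 2))
plot(log_model)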

II. Reflection

My background in coding is fairly limited, so starting this class made me a little nervous. It took me some time to get comfortable with the different functions and their versatility (e.g., ggplot, summary, and the tidyverse). While I do not consider myself an expert by any means, I feel much more prepared for the more complex classes ahead, such as econometrics. If possible, I would like to take it with you, Professor Sharma; I really enjoyed your class and its overall structure, with the discussion posts and homework assignments. I am a very hands-on individual, and these types of assignments help me a lot.

I want to continue learning about multivariate regression and how to create predictive models. I may be among the small percentage of students in this degree for whom being a data analyst is not the career goal. That said, I aim to understand these models as well as I can so that I can use them to make business decisions at work when evaluating our financial risk management. This class has helped me fill many knowledge gaps about how to structure multivariate regression and logistic models and how to analyze and test them.