Preparing Workspace

rm(list = ls())
cat("\f") 

Preparing Packages

 packages <- c("psych",
               "dplyr",
               "ggplot2",
               "corrplot",
               "readxl",
               "lmtest",
               "MASS",
               "caret"
               )
   
  for (i in 1:length(packages)) {
    if (!packages[i] %in% rownames(installed.packages())) {
      install.packages(packages[i], dependencies = TRUE)
    }
    library(packages[i], character.only = TRUE)
  }
## Warning: package 'psych' was built under R version 4.3.2
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
## 
## Attaching package: 'ggplot2'
## The following objects are masked from 'package:psych':
## 
##     %+%, alpha
## corrplot 0.92 loaded
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
## Loading required package: lattice
  rm(packages)

Setting WD

setwd("/Users/josephmancuso/Documents/BC/Spring'24/Econometrics/Week 4/HW")
getwd()
## [1] "/Users/josephmancuso/Documents/BC/Spring'24/Econometrics/Week 4/HW"

Part I:

Import the identified dataset (or datasets, if you need to combine a few sources to create a richer dataset), investigate each variable for: missing values, min/max/mean/stdev (if numeric) or number of distinct levels (if categorical); investigate correlations between variables as well.

Importing Dataset

my.data <- read_xlsx("/Users/josephmancuso/Documents/BC/Spring'24/Econometrics/Week 4/HW/Wine_data.xlsx")

my.data <- as.data.frame(my.data)

Background: For this assignment, I chose to analyze the white wine quality dataset available through Kaggle. The dataset contains observations related to the physiochemical properties (inputs) and quality (output) of 4898 white variants of the Portuguese “Vinho Verde” wine. All physicocehmcial variables are continuous. The quality variable is ordinal and ranks each wine on a scale from 1-10, with 10 being the highest score. My goal is to build a model which predicts the quality of each variant based on their underlying physiochemical properties.

str(my.data)
## 'data.frame':    4898 obs. of  12 variables:
##  $ fixed acidity       : num  7 6.3 8.1 7.2 7.2 8.1 6.2 7 6.3 8.1 ...
##  $ volatile acidity    : num  0.27 0.3 0.28 0.23 0.23 0.28 0.32 0.27 0.3 0.22 ...
##  $ citric acid         : num  0.36 0.34 0.4 0.32 0.32 0.4 0.16 0.36 0.34 0.43 ...
##  $ residual sugar      : num  20.7 1.6 6.9 8.5 8.5 6.9 7 20.7 1.6 1.5 ...
##  $ chlorides           : num  0.045 0.049 0.05 0.058 0.058 0.05 0.045 0.045 0.049 0.044 ...
##  $ free sulfur dioxide : num  45 14 30 47 47 30 30 45 14 28 ...
##  $ total sulfur dioxide: num  170 132 97 186 186 97 136 170 132 129 ...
##  $ density             : num  1.001 0.994 0.995 0.996 0.996 ...
##  $ pH                  : num  3 3.3 3.26 3.19 3.19 3.26 3.18 3 3.3 3.22 ...
##  $ sulphates           : num  0.45 0.49 0.44 0.4 0.4 0.44 0.47 0.45 0.49 0.45 ...
##  $ alcohol             : num  8.8 9.5 10.1 9.9 9.9 10.1 9.6 8.8 9.5 11 ...
##  $ quality             : num  6 6 6 6 6 6 6 6 6 6 ...

Summary Statistics

#preparing summary statistics
psych::describe(my.data)
##                      vars    n   mean    sd median trimmed   mad  min    max
## fixed acidity           1 4898   6.85  0.84   6.80    6.82  0.74 3.80  14.20
## volatile acidity        2 4898   0.28  0.10   0.26    0.27  0.09 0.08   1.10
## citric acid             3 4898   0.33  0.12   0.32    0.33  0.09 0.00   1.66
## residual sugar          4 4898   6.39  5.07   5.20    5.80  5.34 0.60  65.80
## chlorides               5 4898   0.05  0.02   0.04    0.04  0.01 0.01   0.35
## free sulfur dioxide     6 4898  35.31 17.01  34.00   34.36 16.31 2.00 289.00
## total sulfur dioxide    7 4898 138.36 42.50 134.00  136.96 43.00 9.00 440.00
## density                 8 4898   0.99  0.00   0.99    0.99  0.00 0.99   1.04
## pH                      9 4898   3.19  0.15   3.18    3.18  0.15 2.72   3.82
## sulphates              10 4898   0.49  0.11   0.47    0.48  0.10 0.22   1.08
## alcohol                11 4898  10.51  1.23  10.40   10.43  1.48 8.00  14.20
## quality                12 4898   5.88  0.89   6.00    5.85  1.48 3.00   9.00
##                       range skew kurtosis   se
## fixed acidity         10.40 0.65     2.17 0.01
## volatile acidity       1.02 1.58     5.08 0.00
## citric acid            1.66 1.28     6.16 0.00
## residual sugar        65.20 1.08     3.46 0.07
## chlorides              0.34 5.02    37.51 0.00
## free sulfur dioxide  287.00 1.41    11.45 0.24
## total sulfur dioxide 431.00 0.39     0.57 0.61
## density                0.05 0.98     9.78 0.00
## pH                     1.10 0.46     0.53 0.00
## sulphates              0.86 0.98     1.59 0.00
## alcohol                6.20 0.49    -0.70 0.02
## quality                6.00 0.16     0.21 0.01

Correlation

numeric <- my.data[, sapply(my.data, is.numeric)] #isolating numeric variables

correlation.matrix <- cor(numeric, use = "pairwise") #creating correlation matrix
corrplot(correlation.matrix, type = "lower", tl.cex = 0.7)

cor(my.data$quality, my.data$alcohol) #moderate positive correlation
## [1] 0.4355747
cor(my.data$quality, my.data$density) #moderate negative correlation
## [1] -0.3071233
cor(my.data$quality, my.data$pH) #weak positive correlation
## [1] 0.09942725
cor(my.data$quality, my.data$chlorides) #weak negative correlation
## [1] -0.2099344
cor(my.data$quality, my.data$`volatile acidity`) #weak negative correlation
## [1] -0.194723

Part II:

Fit, and comment on the goodness of fit of, a linear regression model. Use the “predict” function to estimate the fitted values for the given data points. Look for the largest error (in absolute value) - can you try to explain what “went wrong” with the model in this instance?

Fitting Model

my.lm <- lm(my.data$quality ~ ., data = my.data)
summary(my.lm)
## 
## Call:
## lm(formula = my.data$quality ~ ., data = my.data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.8348 -0.4934 -0.0379  0.4637  3.1143 
## 
## Coefficients:
##                          Estimate Std. Error t value Pr(>|t|)    
## (Intercept)             1.502e+02  1.880e+01   7.987 1.71e-15 ***
## `fixed acidity`         6.552e-02  2.087e-02   3.139  0.00171 ** 
## `volatile acidity`     -1.863e+00  1.138e-01 -16.373  < 2e-16 ***
## `citric acid`           2.209e-02  9.577e-02   0.231  0.81759    
## `residual sugar`        8.148e-02  7.527e-03  10.825  < 2e-16 ***
## chlorides              -2.473e-01  5.465e-01  -0.452  0.65097    
## `free sulfur dioxide`   3.733e-03  8.441e-04   4.422 9.99e-06 ***
## `total sulfur dioxide` -2.857e-04  3.781e-04  -0.756  0.44979    
## density                -1.503e+02  1.907e+01  -7.879 4.04e-15 ***
## pH                      6.863e-01  1.054e-01   6.513 8.10e-11 ***
## sulphates               6.315e-01  1.004e-01   6.291 3.44e-10 ***
## alcohol                 1.935e-01  2.422e-02   7.988 1.70e-15 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.7514 on 4886 degrees of freedom
## Multiple R-squared:  0.2819, Adjusted R-squared:  0.2803 
## F-statistic: 174.3 on 11 and 4886 DF,  p-value: < 2.2e-16
par(mfrow=c(2,3))
plot(my.lm, c(1:6))

Assumtptions:

Estimating Fitted Values

predictions <- predict(my.lm, type = "response")
summary(predictions)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   3.130   5.550   5.863   5.878   6.225   7.250
observed <- my.data$quality
outcome.table <- table(observed, predictions)
outcome.table.accuracy <- sum(diag(outcome.table)) / sum(outcome.table)

print(outcome.table.accuracy)
## [1] 0.000204165

Determining Largest Error

errors <- abs(my.data$quality - predictions)
largest.error <- max(errors)
index <- which.max(errors)
data.point <- my.data[index,]
print(data.point)
##      fixed acidity volatile acidity citric acid residual sugar chlorides
## 4746           6.1             0.26        0.25            2.9     0.047
##      free sulfur dioxide total sulfur dioxide density   pH sulphates alcohol
## 4746                 289                  440 0.99314 3.44      0.64    10.5
##      quality
## 4746       3
z.score <- (errors[index] - mean(errors)) / sd(errors)
print(z.score)
##     4746 
## 6.891292

Interpretation: The largest error in the model is driven by observation 4,746. The observation’s z-score of 6.89 suggests the datapoint is a significant outlier which is likely driving the error and reducing the predictive power of the model.

Part III:

Fit a logistic regression for a binary variable you need to create out of a numeric one (e.g. “engine_power > 100”). Comment on the goodness of fit. Use the “predict” to estimate the probabilities associated with the positive outcome; compute the residuals. As above, look for the largest error (in absolute value) - can you try to explain what “went wrong” with the model in this instance?

Creating Binary Variable

my.data.2 <- my.data
my.data.2$quality <- ifelse(my.data.2$quality >= 6, "High", "Low")
my.data.2$quality <- as.factor(my.data.2$quality)

Fitting Model

my.logit <- glm(quality ~., data = my.data.2, family = "binomial")
summary(my.logit)
## 
## Call:
## glm(formula = quality ~ ., family = "binomial", data = my.data.2)
## 
## Coefficients:
##                          Estimate Std. Error z value Pr(>|z|)    
## (Intercept)            -2.582e+02  7.099e+01  -3.638 0.000275 ***
## `fixed acidity`        -3.648e-02  7.178e-02  -0.508 0.611271    
## `volatile acidity`      6.459e+00  4.128e-01  15.646  < 2e-16 ***
## `citric acid`          -1.158e-01  3.029e-01  -0.382 0.702219    
## `residual sugar`       -1.701e-01  2.704e-02  -6.291 3.16e-10 ***
## chlorides              -8.852e-01  1.671e+00  -0.530 0.596379    
## `free sulfur dioxide`  -9.601e-03  2.782e-03  -3.451 0.000560 ***
## `total sulfur dioxide`  1.333e-03  1.211e-03   1.101 0.270982    
## density                 2.709e+02  7.195e+01   3.765 0.000167 ***
## pH                     -1.090e+00  3.618e-01  -3.013 0.002590 ** 
## sulphates              -1.797e+00  3.595e-01  -5.000 5.75e-07 ***
## alcohol                -7.429e-01  9.361e-02  -7.937 2.08e-15 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 6245.4  on 4897  degrees of freedom
## Residual deviance: 4932.6  on 4886  degrees of freedom
## AIC: 4956.6
## 
## Number of Fisher Scoring iterations: 5
prediction.2 <- predict(my.logit, newdata = my.data.2, type = "response")

#defining threshold
threshold <- 0.5

#transforming predicted values into binary operators
predicted.2 <- ifelse(prediction.2 >= threshold, yes = 1, no = 0)
observed <- my.data.2$quality
outcome.table <- table(observed, predicted.2)
outcome.table
##         predicted.2
## observed    0    1
##     High 2860  398
##     Low   826  814
#determining accuracy
outcome.table.accuracy <- sum(diag(outcome.table)) / sum(outcome.table)

print(outcome.table.accuracy)
## [1] 0.7501021
errors <- abs(my.data.2$quality - prediction.2)
## Warning in Ops.factor(my.data.2$quality, prediction.2): '-' not meaningful for
## factors
largest.error <- max(errors)
index <- which.max(errors)
data.point <- my.data.2[index,]
print(data.point)
##  [1] fixed acidity        volatile acidity     citric acid         
##  [4] residual sugar       chlorides            free sulfur dioxide 
##  [7] total sulfur dioxide density              pH                  
## [10] sulphates            alcohol              quality             
## <0 rows> (or 0-length row.names)