Preparing Workspace
rm(list = ls())
cat("\f")
Preparing Packages
packages <- c("psych",
"dplyr",
"ggplot2",
"corrplot",
"readxl",
"lmtest",
"MASS",
"caret"
)
for (i in 1:length(packages)) {
if (!packages[i] %in% rownames(installed.packages())) {
install.packages(packages[i], dependencies = TRUE)
}
library(packages[i], character.only = TRUE)
}
## Warning: package 'psych' was built under R version 4.3.2
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
##
## Attaching package: 'ggplot2'
## The following objects are masked from 'package:psych':
##
## %+%, alpha
## corrplot 0.92 loaded
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
## Loading required package: lattice
rm(packages)
Setting WD
setwd("/Users/josephmancuso/Documents/BC/Spring'24/Econometrics/Week 4/HW")
getwd()
## [1] "/Users/josephmancuso/Documents/BC/Spring'24/Econometrics/Week 4/HW"
Import the identified dataset (or datasets, if you need to combine a few sources to create a richer dataset), investigate each variable for: missing values, min/max/mean/stdev (if numeric) or number of distinct levels (if categorical); investigate correlations between variables as well.
Importing Dataset
my.data <- read_xlsx("/Users/josephmancuso/Documents/BC/Spring'24/Econometrics/Week 4/HW/Wine_data.xlsx")
my.data <- as.data.frame(my.data)
Background: For this assignment, I chose to analyze the white wine quality dataset available through Kaggle. The dataset contains observations on the physicochemical properties (inputs) and quality (output) of 4898 white variants of the Portuguese “Vinho Verde” wine. All physicochemical variables are continuous. The quality variable is ordinal and ranks each wine on a scale from 1 to 10, with 10 being the highest score. My goal is to build a model that predicts the quality of each variant from its underlying physicochemical properties.
str(my.data)
## 'data.frame': 4898 obs. of 12 variables:
## $ fixed acidity : num 7 6.3 8.1 7.2 7.2 8.1 6.2 7 6.3 8.1 ...
## $ volatile acidity : num 0.27 0.3 0.28 0.23 0.23 0.28 0.32 0.27 0.3 0.22 ...
## $ citric acid : num 0.36 0.34 0.4 0.32 0.32 0.4 0.16 0.36 0.34 0.43 ...
## $ residual sugar : num 20.7 1.6 6.9 8.5 8.5 6.9 7 20.7 1.6 1.5 ...
## $ chlorides : num 0.045 0.049 0.05 0.058 0.058 0.05 0.045 0.045 0.049 0.044 ...
## $ free sulfur dioxide : num 45 14 30 47 47 30 30 45 14 28 ...
## $ total sulfur dioxide: num 170 132 97 186 186 97 136 170 132 129 ...
## $ density : num 1.001 0.994 0.995 0.996 0.996 ...
## $ pH : num 3 3.3 3.26 3.19 3.19 3.26 3.18 3 3.3 3.22 ...
## $ sulphates : num 0.45 0.49 0.44 0.4 0.4 0.44 0.47 0.45 0.49 0.45 ...
## $ alcohol : num 8.8 9.5 10.1 9.9 9.9 10.1 9.6 8.8 9.5 11 ...
## $ quality : num 6 6 6 6 6 6 6 6 6 6 ...
Summary Statistics
#preparing summary statistics
psych::describe(my.data)
## vars n mean sd median trimmed mad min max
## fixed acidity 1 4898 6.85 0.84 6.80 6.82 0.74 3.80 14.20
## volatile acidity 2 4898 0.28 0.10 0.26 0.27 0.09 0.08 1.10
## citric acid 3 4898 0.33 0.12 0.32 0.33 0.09 0.00 1.66
## residual sugar 4 4898 6.39 5.07 5.20 5.80 5.34 0.60 65.80
## chlorides 5 4898 0.05 0.02 0.04 0.04 0.01 0.01 0.35
## free sulfur dioxide 6 4898 35.31 17.01 34.00 34.36 16.31 2.00 289.00
## total sulfur dioxide 7 4898 138.36 42.50 134.00 136.96 43.00 9.00 440.00
## density 8 4898 0.99 0.00 0.99 0.99 0.00 0.99 1.04
## pH 9 4898 3.19 0.15 3.18 3.18 0.15 2.72 3.82
## sulphates 10 4898 0.49 0.11 0.47 0.48 0.10 0.22 1.08
## alcohol 11 4898 10.51 1.23 10.40 10.43 1.48 8.00 14.20
## quality 12 4898 5.88 0.89 6.00 5.85 1.48 3.00 9.00
## range skew kurtosis se
## fixed acidity 10.40 0.65 2.17 0.01
## volatile acidity 1.02 1.58 5.08 0.00
## citric acid 1.66 1.28 6.16 0.00
## residual sugar 65.20 1.08 3.46 0.07
## chlorides 0.34 5.02 37.51 0.00
## free sulfur dioxide 287.00 1.41 11.45 0.24
## total sulfur dioxide 431.00 0.39 0.57 0.61
## density 0.05 0.98 9.78 0.00
## pH 1.10 0.46 0.53 0.00
## sulphates 0.86 0.98 1.59 0.00
## alcohol 6.20 0.49 -0.70 0.02
## quality 6.00 0.16 0.21 0.01
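The describe() output reports n = 4898 for every variable, which suggests there are no missing values; a direct check (not shown in the output above) could be:
#counting missing values in each column
colSums(is.na(my.data))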
Correlation
numeric <- my.data[, sapply(my.data, is.numeric)] #isolating numeric variables
correlation.matrix <- cor(numeric, use = "pairwise") #creating correlation matrix
corrplot(correlation.matrix, type = "lower", tl.cex = 0.7)
cor(my.data$quality, my.data$alcohol) #moderate positive correlation
## [1] 0.4355747
cor(my.data$quality, my.data$density) #moderate negative correlation
## [1] -0.3071233
cor(my.data$quality, my.data$pH) #weak positive correlation
## [1] 0.09942725
cor(my.data$quality, my.data$chlorides) #weak negative correlation
## [1] -0.2099344
cor(my.data$quality, my.data$`volatile acidity`) #weak negative correlation
## [1] -0.194723
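To see all of the quality correlations at once, the quality column of the correlation matrix built above can be sorted; a quick sketch:
#ranking each variable's correlation with quality
sort(correlation.matrix[, "quality"], decreasing = TRUE)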
Fit, and comment on the goodness of fit of, a linear regression model. Use the “predict” function to estimate the fitted values for the given data points. Look for the largest error (in absolute value) - can you try to explain what “went wrong” with the model in this instance?
Fitting Model
my.lm <- lm(my.data$quality ~ ., data = my.data)
summary(my.lm)
##
## Call:
## lm(formula = my.data$quality ~ ., data = my.data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.8348 -0.4934 -0.0379 0.4637 3.1143
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.502e+02 1.880e+01 7.987 1.71e-15 ***
## `fixed acidity` 6.552e-02 2.087e-02 3.139 0.00171 **
## `volatile acidity` -1.863e+00 1.138e-01 -16.373 < 2e-16 ***
## `citric acid` 2.209e-02 9.577e-02 0.231 0.81759
## `residual sugar` 8.148e-02 7.527e-03 10.825 < 2e-16 ***
## chlorides -2.473e-01 5.465e-01 -0.452 0.65097
## `free sulfur dioxide` 3.733e-03 8.441e-04 4.422 9.99e-06 ***
## `total sulfur dioxide` -2.857e-04 3.781e-04 -0.756 0.44979
## density -1.503e+02 1.907e+01 -7.879 4.04e-15 ***
## pH 6.863e-01 1.054e-01 6.513 8.10e-11 ***
## sulphates 6.315e-01 1.004e-01 6.291 3.44e-10 ***
## alcohol 1.935e-01 2.422e-02 7.988 1.70e-15 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.7514 on 4886 degrees of freedom
## Multiple R-squared: 0.2819, Adjusted R-squared: 0.2803
## F-statistic: 174.3 on 11 and 4886 DF, p-value: < 2.2e-16
par(mfrow=c(2,3))
plot(my.lm, c(1:6))
Assumptions:
Homoscedasticity: The residuals appear to exhibit a consistent downward-sloping “line” pattern, which indicates heteroscedasticity may be present in the model. The Breusch-Pagan test provides further evidence that the residuals are heteroscedastic: at a significance level of 0.05, we have sufficient evidence to reject the null hypothesis of homoscedasticity.
bptest(my.lm)
##
## studentized Breusch-Pagan test
##
## data: my.lm
## BP = 93.946, df = 11, p-value = 2.8e-15
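Given this evidence of heteroscedasticity, one common remedy is to re-estimate the coefficient tests with heteroscedasticity-robust standard errors. A minimal sketch, assuming the sandwich package is installed (it is not among the packages loaded above):
#coefficient tests with heteroscedasticity-robust (HC1) standard errors
coeftest(my.lm, vcov = sandwich::vcovHC(my.lm, type = "HC1"))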
Normality of Residuals: The Q-Q plot suggests the residuals are not normally distributed, as the observations deviate slightly from the reference line and curve away at the extremes. To test this more formally, I performed a Shapiro-Wilk normality test in conjunction with the Q-Q plot. The results provide strong evidence that the residuals are not normally distributed, which raises questions about the reliability of our confidence intervals and p-values.
shapiro.test(residuals(my.lm))
##
## Shapiro-Wilk normality test
##
## data: residuals(my.lm)
## W = 0.98937, p-value < 2.2e-16
Linearity: The absence of equally and randomly spread residuals around the horizontal line indicates there may be a non-linear relationship between the output variable (quality) and the predictors (physicochemical properties).
plot(residuals(my.lm) ~ fitted(my.lm))
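As a more formal complement to this visual inspection of linearity, the Ramsey RESET test from the already-loaded lmtest package could be applied; this is a sketch that was not run in the original analysis:
resettest(my.lm) #H0: the linear specification is adequate (no omitted higher-order terms)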
Independence: The Durbin-Watson test statistic of 1.62 and the very small p-value suggest that positive autocorrelation is present in the residuals.
#small p-value: sufficient evidence to reject the null hypothesis of no autocorrelation
dwtest(my.data$quality ~ ., data = my.data)
##
## Durbin-Watson test
##
## data: my.data$quality ~ .
## DW = 1.6206, p-value < 2.2e-16
## alternative hypothesis: true autocorrelation is greater than 0
Estimating Fitted Values
predictions <- predict(my.lm, type = "response")
summary(predictions)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 3.130 5.550 5.863 5.878 6.225 7.250
observed <- my.data$quality
#the fitted values from lm() are continuous, so cross-tabulating them against the integer
#quality scores rarely produces exact matches; the "accuracy" below is near zero by
#construction (see the error metrics sketched further down)
outcome.table <- table(observed, predictions)
outcome.table.accuracy <- sum(diag(outcome.table)) / sum(outcome.table)
print(outcome.table.accuracy)
## [1] 0.000204165
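Because the fitted values are continuous, error metrics such as RMSE and MAE are more informative measures of fit than exact-match accuracy; a short sketch using the objects created above:
sqrt(mean((observed - predictions)^2)) #root mean squared error
mean(abs(observed - predictions))      #mean absolute error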
Determining Largest Error
errors <- abs(my.data$quality - predictions)
largest.error <- max(errors)
index <- which.max(errors)
data.point <- my.data[index,]
print(data.point)
## fixed acidity volatile acidity citric acid residual sugar chlorides
## 4746 6.1 0.26 0.25 2.9 0.047
## free sulfur dioxide total sulfur dioxide density pH sulphates alcohol
## 4746 289 440 0.99314 3.44 0.64 10.5
## quality
## 4746 3
z.score <- (errors[index] - mean(errors)) / sd(errors)
print(z.score)
## 4746
## 6.891292
Interpretation: The largest error in the model comes from observation 4,746. The absolute error for this observation has a z-score of 6.89, indicating that the data point is an extreme outlier relative to the other errors. The wine records the highest free sulfur dioxide (289) and total sulfur dioxide (440) values in the dataset yet received a quality score of only 3, so the model overpredicts its quality by nearly four points, reducing the predictive power of the model.
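A possible follow-up, not part of the original analysis, is to check whether this observation is also influential on the fitted coefficients using Cook's distance:
cooks.d <- cooks.distance(my.lm) #influence of each observation on the fitted model
which.max(cooks.d)               #index of the most influential observation
cooks.d[index]                   #Cook's distance for the largest-error observation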
Fit a logistic regression for a binary variable you need to create out of a numeric one (e.g. “engine_power > 100”). Comment on the goodness of fit. Use the “predict” function to estimate the probabilities associated with the positive outcome; compute the residuals. As above, look for the largest error (in absolute value) - can you try to explain what “went wrong” with the model in this instance?
Creating Binary Variable
my.data.2 <- my.data
my.data.2$quality <- ifelse(my.data.2$quality >= 6, "High", "Low")
my.data.2$quality <- as.factor(my.data.2$quality)
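Before fitting, it is worth checking how balanced the new binary outcome is; a quick sketch:
#distribution of the binary quality variable
table(my.data.2$quality)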
Fitting Model
#with a factor response, glm() models the probability of the second factor level ("Low"),
#so positive coefficients increase the predicted probability of a "Low" quality wine
my.logit <- glm(quality ~ ., data = my.data.2, family = "binomial")
summary(my.logit)
##
## Call:
## glm(formula = quality ~ ., family = "binomial", data = my.data.2)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.582e+02 7.099e+01 -3.638 0.000275 ***
## `fixed acidity` -3.648e-02 7.178e-02 -0.508 0.611271
## `volatile acidity` 6.459e+00 4.128e-01 15.646 < 2e-16 ***
## `citric acid` -1.158e-01 3.029e-01 -0.382 0.702219
## `residual sugar` -1.701e-01 2.704e-02 -6.291 3.16e-10 ***
## chlorides -8.852e-01 1.671e+00 -0.530 0.596379
## `free sulfur dioxide` -9.601e-03 2.782e-03 -3.451 0.000560 ***
## `total sulfur dioxide` 1.333e-03 1.211e-03 1.101 0.270982
## density 2.709e+02 7.195e+01 3.765 0.000167 ***
## pH -1.090e+00 3.618e-01 -3.013 0.002590 **
## sulphates -1.797e+00 3.595e-01 -5.000 5.75e-07 ***
## alcohol -7.429e-01 9.361e-02 -7.937 2.08e-15 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 6245.4 on 4897 degrees of freedom
## Residual deviance: 4932.6 on 4886 degrees of freedom
## AIC: 4956.6
##
## Number of Fisher Scoring iterations: 5
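As a rough goodness-of-fit measure for the logistic model, McFadden's pseudo R-squared can be computed from the deviances reported above; a minimal sketch:
#McFadden's pseudo R-squared: 1 - residual deviance / null deviance
1 - my.logit$deviance / my.logit$null.deviance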
prediction.2 <- predict(my.logit, newdata = my.data.2, type = "response")
#defining threshold
threshold <- 0.5
#transforming predicted probabilities into binary outcomes (1 = predicted "Low", 0 = predicted "High")
predicted.2 <- ifelse(prediction.2 >= threshold, yes = 1, no = 0)
observed <- my.data.2$quality
outcome.table <- table(observed, predicted.2)
outcome.table
## predicted.2
## observed 0 1
## High 2860 398
## Low 826 814
#determining accuracy
outcome.table.accuracy <- sum(diag(outcome.table)) / sum(outcome.table)
print(outcome.table.accuracy)
## [1] 0.7501021
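Since the caret package is loaded above, confusionMatrix() can supplement the raw accuracy with sensitivity and specificity; a sketch, treating "Low" as the positive class because the model estimates P(quality == "Low"):
confusionMatrix(factor(predicted.2, levels = c(0, 1), labels = c("High", "Low")),
                my.data.2$quality, positive = "Low")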
#quality in my.data.2 is a factor, so subtracting it directly from the predicted
#probabilities fails; recode the observed outcome as numeric (1 = "Low", the level whose
#probability the model estimates) before computing the residuals
observed.numeric <- ifelse(my.data.2$quality == "Low", 1, 0)
errors <- abs(observed.numeric - prediction.2)
largest.error <- max(errors)
index <- which.max(errors)
data.point <- my.data.2[index,]
print(data.point)