options(repos = c(CRAN = "https://cloud.r-project.org"))
mydata <- read.csv("./student_data.csv", header=TRUE, sep=";", dec=",")
head(mydata)
## Marital.status Application.mode Application.order Course evening.attendance
## 1 1 8 5 2 1
## 2 1 6 1 11 1
## 3 1 1 5 5 1
## 4 1 8 2 15 1
## 5 2 12 1 3 0
## 6 2 12 1 17 0
## Previous.qualification Nacionality Mother.s.qualification
## 1 1 1 13
## 2 1 1 1
## 3 1 1 22
## 4 1 1 23
## 5 1 1 22
## 6 12 1 22
## Father.s.qualification Mother.s.occupation Father.s.occupation Displaced
## 1 10 6 10 1
## 2 3 4 4 1
## 3 27 10 10 1
## 4 27 6 4 1
## 5 28 10 10 0
## 6 27 10 8 0
## Educational.special.needs Debtor Tuition.fees.up.to.date Gender
## 1 0 0 1 1
## 2 0 0 0 1
## 3 0 0 0 1
## 4 0 0 1 0
## 5 0 0 1 0
## 6 0 1 1 1
## Scholarship.holder Age.at.enrollment International
## 1 0 20 0
## 2 0 19 0
## 3 0 19 0
## 4 0 20 0
## 5 0 45 0
## 6 0 50 0
## Curricular.units.1st.sem..credited. Curricular.units.1st.sem..enrolled.
## 1 0 0
## 2 0 6
## 3 0 6
## 4 0 6
## 5 0 6
## 6 0 5
## Curricular.units.1st.sem..evaluations. Curricular.units.1st.sem..approved.
## 1 0 0
## 2 6 6
## 3 0 0
## 4 8 6
## 5 9 5
## 6 10 5
## Curricular.units.1st.sem..grade.
## 1 0.0
## 2 14.0
## 3 0.0
## 4 13.428571428571429
## 5 12.333333333333334
## 6 11.857142857142858
## Curricular.units.1st.sem..without.evaluations.
## 1 0
## 2 0
## 3 0
## 4 0
## 5 0
## 6 0
## Curricular.units.2nd.sem..credited. Curricular.units.2nd.sem..enrolled.
## 1 0 0
## 2 0 6
## 3 0 6
## 4 0 6
## 5 0 6
## 6 0 5
## Curricular.units.2nd.sem..evaluations. Curricular.units.2nd.sem..approved.
## 1 0 0
## 2 6 6
## 3 0 0
## 4 10 5
## 5 6 6
## 6 17 5
## Curricular.units.2nd.sem..grade.
## 1 0.0
## 2 13.666666666666666
## 3 0.0
## 4 12.4
## 5 13.0
## 6 11.5
## Curricular.units.2nd.sem..without.evaluations. Unemployment.rate
## 1 0 10.8
## 2 0 13.9
## 3 0 10.8
## 4 0 9.4
## 5 0 13.9
## 6 5 16.2
## Inflation.rate GDP Output
## 1 1.4 1.74 Dropout
## 2 -0.3 0.79 Graduate
## 3 1.4 1.74 Dropout
## 4 -0.8 -3.12 Graduate
## 5 -0.3 0.79 Graduate
## 6 0.3 -0.92 Graduate
##PART 1 - CORRELATION ANALYSIS #ASSOSIATION BETWEEN TWO CATEGORICAL VARIABLES
Source of data:https://www.kaggle.com/datasets/marouandaghmoumi/dropout-and-success-student-data-analysis/data
#Selecting variables for analysis
selected_columns <- c("Gender", "Scholarship.holder")
mydataa <- mydata[selected_columns]
head(mydataa)
## Gender Scholarship.holder
## 1 1 0
## 2 1 0
## 3 1 0
## 4 0 0
## 5 0 0
## 6 1 0
Description: Gender: binary variable (0 -female; 1- male) Scholarship.holder: Whether the person holds a scholarship (0 -no; 1 -yes)
##Factoring
#Factoring
mydataa$GenderFactor <- factor(mydataa$Gender,
levels = c(0,1),
labels = c("female", "male"))
mydataa$Scholarship.holderFactor <- factor(mydataa$Scholarship.holder,
levels = c(0,1),
labels = c("no","yes"))
head(mydataa)
## Gender Scholarship.holder GenderFactor Scholarship.holderFactor
## 1 1 0 male no
## 2 1 0 male no
## 3 1 0 male no
## 4 0 0 female no
## 5 0 0 female no
## 6 1 0 male no
#chi-squared test
results <- chisq.test(mydataa$GenderFactor, mydataa$Scholarship.holderFactor, correct = TRUE)
results
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: mydataa$GenderFactor and mydataa$Scholarship.holderFactor
## X-squared = 125.99, df = 1, p-value < 2.2e-16
We reject the null hypothesis at p<0.001 indicating an association between the Gender and Scholarship.holder
#observed frequencies
addmargins(results$observed)
##
## mydataa$GenderFactor no yes Sum
## female 2001 867 2868
## male 1324 232 1556
## Sum 3325 1099 4424
#expected frequencies
round(results$expected, 2)
##
## mydataa$GenderFactor no yes
## female 2155.54 712.46
## male 1169.46 386.54
#difference between the observed and expected frequencies
round(results$res, 2)
##
## mydataa$GenderFactor no yes
## female -3.33 5.79
## male 4.52 -7.86
The standardized residuals are outside of +-3.29. meaning the deviation is statistically significant at a=0.001
#distribution of observations in terms of proportions
addmargins(round(prop.table(results$observed),3))
##
## mydataa$GenderFactor no yes Sum
## female 0.452 0.196 0.648
## male 0.299 0.052 0.351
## Sum 0.751 0.248 0.999
#distribution of scholarship status within each gender category
addmargins(round(prop.table(results$observed, 1),3),2)
##
## mydataa$GenderFactor no yes Sum
## female 0.698 0.302 1.000
## male 0.851 0.149 1.000
#distribution of gender within each scholarship status category
addmargins(round(prop.table(results$observed, 2),3),1)
##
## mydataa$GenderFactor no yes
## female 0.602 0.789
## male 0.398 0.211
## Sum 1.000 1.000
#preform Cramer's v test
install.packages("effectsize")
## Installing package into 'C:/Users/pedro/AppData/Local/R/win-library/4.3'
## (as 'lib' is unspecified)
## package 'effectsize' successfully unpacked and MD5 sums checked
##
## The downloaded binary packages are in
## C:\Users\pedro\AppData\Local\Temp\RtmpykYBzT\downloaded_packages
library(effectsize)
effectsize::cramers_v(mydataa$GenderFactor, mydataa$Scholarship.holderFactor)
## Cramer's V (adj.) | 95% CI
## --------------------------------
## 0.17 | [0.14, 1.00]
##
## - One-sided CIs: upper bound fixed at [1.00].
the Cramér’s V of 0.17 suggests a SMALL association between Gender and Scholarship.
#2x2 table with the observed results
observed_table <- table(mydataa$GenderFactor, mydataa$Scholarship.holderFactor)
#odds for each category
odds_male <- observed_table["male", "yes"] / observed_table["male", "no"]
odds_female <- observed_table["female", "yes"] / observed_table["female", "no"]
#odds ratio
odds_ratio <- odds_male / odds_female
#results
print(observed_table)
##
## no yes
## female 2001 867
## male 1324 232
print(paste("Odds[Scholarship_yes|Female:", round(odds_female, 3)))
## [1] "Odds[Scholarship_yes|Female: 0.433"
print(paste("Odds[Scholarship_yes|Male:", round(odds_male, 3)))
## [1] "Odds[Scholarship_yes|Male: 0.175"
print(paste("Odds Ratio:", round(odds_ratio, 3)))
## [1] "Odds Ratio: 0.404"
fisher.test(mydataa$GenderFactor, mydataa$Scholarship.holderFactor)
##
## Fisher's Exact Test for Count Data
##
## data: mydataa$GenderFactor and mydataa$Scholarship.holderFactor
## p-value < 2.2e-16
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
## 0.3428866 0.4759461
## sample estimates:
## odds ratio
## 0.404492
##Interpretation of the results
Probabilities and Odds -the probability of getting a scholarship if the person is a female is 0.433 times the probability of not getting a scholarship (if the person is a female). -the probability of getting a scholarship if the person is a male is 0.175 times the probability of not getting a scholarship (if the person is a male). it is less likely
The odds that a person will get a scholarship if it is a female is 0.404 larger than the odds if it is a male.
non parametric - Fisher’s test H0: variables are not assossiated H1: variables are assossiated
We reject H0 (p<0.001), meaning variables are assossiated.
#PART 2 - REGRESSION ANALYSIS
source of data:https://www.kaggle.com/code/sudharshan1729/wine-quality-prediction
mydataWine <- read.csv("./winequality-white.csv", header=TRUE, sep=";", dec=".")
head(mydataWine, 10)
## fixed.acidity volatile.acidity citric.acid residual.sugar chlorides
## 1 7.0 0.27 0.36 20.7 0.045
## 2 6.3 0.30 0.34 1.6 0.049
## 3 8.1 0.28 0.40 6.9 0.050
## 4 7.2 0.23 0.32 8.5 0.058
## 5 7.2 0.23 0.32 8.5 0.058
## 6 8.1 0.28 0.40 6.9 0.050
## 7 6.2 0.32 0.16 7.0 0.045
## 8 7.0 0.27 0.36 20.7 0.045
## 9 6.3 0.30 0.34 1.6 0.049
## 10 8.1 0.22 0.43 1.5 0.044
## free.sulfur.dioxide total.sulfur.dioxide density pH sulphates alcohol
## 1 45 170 1.0010 3.00 0.45 8.8
## 2 14 132 0.9940 3.30 0.49 9.5
## 3 30 97 0.9951 3.26 0.44 10.1
## 4 47 186 0.9956 3.19 0.40 9.9
## 5 47 186 0.9956 3.19 0.40 9.9
## 6 30 97 0.9951 3.26 0.44 10.1
## 7 30 136 0.9949 3.18 0.47 9.6
## 8 45 170 1.0010 3.00 0.45 8.8
## 9 14 132 0.9940 3.30 0.49 9.5
## 10 28 129 0.9938 3.22 0.45 11.0
## quality
## 1 6
## 2 6
## 3 6
## 4 6
## 5 6
## 6 6
## 7 6
## 8 6
## 9 6
## 10 6
Description: -fixed acidity -volatile acidity -citric acid -residual sugar -chlorides -free sulfur dioxide -total sulfur dioxide -density -pH -sulphates -alcohol (all the above are measures in their specific unit) -quality: numeric variable from 0 to 10
install.packages("car")
## Installing package into 'C:/Users/pedro/AppData/Local/R/win-library/4.3'
## (as 'lib' is unspecified)
## package 'car' successfully unpacked and MD5 sums checked
##
## The downloaded binary packages are in
## C:\Users\pedro\AppData\Local\Temp\RtmpykYBzT\downloaded_packages
library(car)
## Loading required package: carData
scatterplotMatrix(mydataWine[ ,c(-1, -7)], smooth = FALSE)
install.packages("Hmisc")
## Installing package into 'C:/Users/pedro/AppData/Local/R/win-library/4.3'
## (as 'lib' is unspecified)
## package 'Hmisc' successfully unpacked and MD5 sums checked
##
## The downloaded binary packages are in
## C:\Users\pedro\AppData\Local\Temp\RtmpykYBzT\downloaded_packages
library(Hmisc)
##
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:base':
##
## format.pval, units
rcorr(as.matrix(mydataWine))#[ ,c(-1, -7)]))
## fixed.acidity volatile.acidity citric.acid residual.sugar
## fixed.acidity 1.00 -0.02 0.29 0.09
## volatile.acidity -0.02 1.00 -0.15 0.06
## citric.acid 0.29 -0.15 1.00 0.09
## residual.sugar 0.09 0.06 0.09 1.00
## chlorides 0.02 0.07 0.11 0.09
## free.sulfur.dioxide -0.05 -0.10 0.09 0.30
## total.sulfur.dioxide 0.09 0.09 0.12 0.40
## density 0.27 0.03 0.15 0.84
## pH -0.43 -0.03 -0.16 -0.19
## sulphates -0.02 -0.04 0.06 -0.03
## alcohol -0.12 0.07 -0.08 -0.45
## quality -0.11 -0.19 -0.01 -0.10
## chlorides free.sulfur.dioxide total.sulfur.dioxide density
## fixed.acidity 0.02 -0.05 0.09 0.27
## volatile.acidity 0.07 -0.10 0.09 0.03
## citric.acid 0.11 0.09 0.12 0.15
## residual.sugar 0.09 0.30 0.40 0.84
## chlorides 1.00 0.10 0.20 0.26
## free.sulfur.dioxide 0.10 1.00 0.62 0.29
## total.sulfur.dioxide 0.20 0.62 1.00 0.53
## density 0.26 0.29 0.53 1.00
## pH -0.09 0.00 0.00 -0.09
## sulphates 0.02 0.06 0.13 0.07
## alcohol -0.36 -0.25 -0.45 -0.78
## quality -0.21 0.01 -0.17 -0.31
## pH sulphates alcohol quality
## fixed.acidity -0.43 -0.02 -0.12 -0.11
## volatile.acidity -0.03 -0.04 0.07 -0.19
## citric.acid -0.16 0.06 -0.08 -0.01
## residual.sugar -0.19 -0.03 -0.45 -0.10
## chlorides -0.09 0.02 -0.36 -0.21
## free.sulfur.dioxide 0.00 0.06 -0.25 0.01
## total.sulfur.dioxide 0.00 0.13 -0.45 -0.17
## density -0.09 0.07 -0.78 -0.31
## pH 1.00 0.16 0.12 0.10
## sulphates 0.16 1.00 -0.02 0.05
## alcohol 0.12 -0.02 1.00 0.44
## quality 0.10 0.05 0.44 1.00
##
## n= 4898
##
##
## P
## fixed.acidity volatile.acidity citric.acid residual.sugar
## fixed.acidity 0.1122 0.0000 0.0000
## volatile.acidity 0.1122 0.0000 0.0000
## citric.acid 0.0000 0.0000 0.0000
## residual.sugar 0.0000 0.0000 0.0000
## chlorides 0.1062 0.0000 0.0000 0.0000
## free.sulfur.dioxide 0.0005 0.0000 0.0000 0.0000
## total.sulfur.dioxide 0.0000 0.0000 0.0000 0.0000
## density 0.0000 0.0578 0.0000 0.0000
## pH 0.0000 0.0255 0.0000 0.0000
## sulphates 0.2303 0.0124 0.0000 0.0620
## alcohol 0.0000 0.0000 0.0000 0.0000
## quality 0.0000 0.0000 0.5193 0.0000
## chlorides free.sulfur.dioxide total.sulfur.dioxide density
## fixed.acidity 0.1062 0.0005 0.0000 0.0000
## volatile.acidity 0.0000 0.0000 0.0000 0.0578
## citric.acid 0.0000 0.0000 0.0000 0.0000
## residual.sugar 0.0000 0.0000 0.0000 0.0000
## chlorides 0.0000 0.0000 0.0000
## free.sulfur.dioxide 0.0000 0.0000 0.0000
## total.sulfur.dioxide 0.0000 0.0000 0.0000
## density 0.0000 0.0000 0.0000
## pH 0.0000 0.9655 0.8710 0.0000
## sulphates 0.2408 0.0000 0.0000 0.0000
## alcohol 0.0000 0.0000 0.0000 0.0000
## quality 0.0000 0.5681 0.0000 0.0000
## pH sulphates alcohol quality
## fixed.acidity 0.0000 0.2303 0.0000 0.0000
## volatile.acidity 0.0255 0.0124 0.0000 0.0000
## citric.acid 0.0000 0.0000 0.0000 0.5193
## residual.sugar 0.0000 0.0620 0.0000 0.0000
## chlorides 0.0000 0.2408 0.0000 0.0000
## free.sulfur.dioxide 0.9655 0.0000 0.0000 0.5681
## total.sulfur.dioxide 0.8710 0.0000 0.0000 0.0000
## density 0.0000 0.0000 0.0000 0.0000
## pH 0.0000 0.0000 0.0000
## sulphates 0.0000 0.2225 0.0002
## alcohol 0.0000 0.2225 0.0000
## quality 0.0000 0.0002 0.0000
Correlation between variables - Here we can see a strong positive correlation between density and residual.sugar (0.84) - also a strong negative correlation between density and alcohol ( -0.78)
To simplify the model we chose, intuitively, the following 6 explanatory variables: -fixed.acidity + citric.acid + residual.sugar + density + pH + alcohol
fit1 <- lm(quality ~ fixed.acidity + citric.acid + residual.sugar + density + pH + alcohol, data = mydataWine)
vif(fit1)
## fixed.acidity citric.acid residual.sugar density pH
## 2.490907 1.106604 10.728665 23.534465 2.086130
## alcohol
## 6.893110
Variance Inflation Factor Vif should be less then 5 - we chose to remove density
fit2 <- lm(quality ~ fixed.acidity + citric.acid + residual.sugar + pH + alcohol, data = mydataWine)
vif(fit2)
## fixed.acidity citric.acid residual.sugar pH alcohol
## 1.307923 1.098427 1.291403 1.260925 1.265443
Variance Inflation Factor All Vif values are below 5. We accept this model for there is not to strong multicolinearity.
summary(fit2)
##
## Call:
## lm(formula = quality ~ fixed.acidity + citric.acid + residual.sugar +
## pH + alcohol, data = mydataWine)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.6155 -0.5370 0.0039 0.4765 3.1644
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.481289 0.347986 4.257 2.11e-05 ***
## fixed.acidity -0.060651 0.015258 -3.975 7.14e-05 ***
## citric.acid 0.290423 0.097502 2.979 0.002909 **
## residual.sugar 0.023008 0.002522 9.121 < 2e-16 ***
## pH 0.281428 0.083724 3.361 0.000781 ***
## alcohol 0.349144 0.010292 33.925 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.7879 on 4892 degrees of freedom
## Multiple R-squared: 0.2094, Adjusted R-squared: 0.2086
## F-statistic: 259.2 on 5 and 4892 DF, p-value: < 2.2e-16
20.94% of the variability of the quality of white wines is explained by linear effect of Fixed Acidity, Citric Acid, Residual Sugar, pH and Alcohol levels. This value is very low because we did not include all the explanatory variables in an atempt to simplify the model.
We proceed to do a model with all explanatory variables.
fit3 <- lm(quality ~ fixed.acidity + volatile.acidity + citric.acid + residual.sugar + chlorides + free.sulfur.dioxide + total.sulfur.dioxide + density + pH + sulphates + alcohol, data = mydataWine)
vif(fit3)
## fixed.acidity volatile.acidity citric.acid
## 2.691435 1.141156 1.165215
## residual.sugar chlorides free.sulfur.dioxide
## 12.644064 1.236822 1.787880
## total.sulfur.dioxide density pH
## 2.239233 28.232546 2.196362
## sulphates alcohol
## 1.138540 7.706957
Included variables: (all) fixed.acidity volatile.acidity
citric.acid residual.sugar chlorides free.sulfur.dioxide
total.sulfur.dioxide density
pH sulphates alcohol
Variance Inflation Factor - Vif should be less then 5 - we chose to remove density
fit4 <- lm(quality ~ fixed.acidity + volatile.acidity + citric.acid + residual.sugar + chlorides + free.sulfur.dioxide + total.sulfur.dioxide + pH + sulphates + alcohol, data = mydataWine)
vif(fit4)
## fixed.acidity volatile.acidity citric.acid
## 1.356128 1.128298 1.159884
## residual.sugar chlorides free.sulfur.dioxide
## 1.435215 1.203645 1.744627
## total.sulfur.dioxide pH sulphates
## 2.153170 1.330912 1.056637
## alcohol
## 1.647117
Variance Inflation Factor -All Vif values are below 5. We accept this model for there is not to strong multicolinearity.
mydataWine$StdResid <- round(rstandard(fit4),3)
mydataWine$CooksD <- round(cooks.distance(fit4),3)
hist(mydataWine$StdResid,
xlab = "Standardized residuals",
ylab = "Frequency",
main = "Histogram of standardized residuals")
shapiro.test(mydataWine$StdResid)
##
## Shapiro-Wilk normality test
##
## data: mydataWine$StdResid
## W = 0.98956, p-value < 2.2e-16
Normality test We reject H0 (erros are normal distribuited), at p<0.001. The erros are not normally distribuited, however, as the sample is so big (>30) we can proceed with the analysis.
hist(mydataWine$CooksD,
xlab = "Cooks distance",
ylab = "Frequency",
main = "histogram of cooks distances")
head(mydataWine[order(mydataWine$StdResid),],30)
## fixed.acidity volatile.acidity citric.acid residual.sugar chlorides
## 4746 6.1 0.260 0.25 2.90 0.047
## 3308 9.4 0.240 0.29 8.50 0.037
## 741 6.9 0.390 0.40 4.60 0.022
## 254 5.8 0.240 0.44 3.50 0.029
## 446 7.1 0.320 0.32 11.00 0.038
## 3811 6.8 0.260 0.34 15.10 0.060
## 1230 8.3 0.330 0.42 1.15 0.033
## 3410 6.2 0.230 0.35 0.70 0.051
## 3088 6.1 0.200 0.34 9.50 0.041
## 1689 6.7 0.250 0.26 1.55 0.041
## 1932 7.1 0.490 0.22 2.00 0.047
## 3737 6.5 0.320 0.48 8.00 0.026
## 252 8.5 0.260 0.21 16.20 0.074
## 1418 8.6 0.550 0.35 15.55 0.057
## 2051 11.8 0.230 0.38 11.10 0.034
## 177 7.2 0.320 0.47 5.10 0.044
## 527 7.2 0.310 0.46 5.00 0.040
## 832 7.4 0.170 0.40 5.50 0.037
## 4509 5.8 0.260 0.30 2.60 0.034
## 2160 6.3 0.240 0.37 1.80 0.031
## 3266 4.2 0.215 0.23 5.10 0.041
## 874 10.3 0.170 0.47 1.40 0.037
## 3771 7.4 0.270 0.31 2.40 0.014
## 2117 6.8 0.200 0.27 1.20 0.034
## 4481 5.9 0.220 0.45 22.60 0.120
## 2374 7.6 0.480 0.37 1.20 0.034
## 3902 4.8 0.650 0.12 1.10 0.013
## 541 6.7 0.310 0.31 9.90 0.040
## 4805 6.0 0.350 0.46 0.90 0.033
## 1485 7.5 0.320 0.24 4.60 0.053
## free.sulfur.dioxide total.sulfur.dioxide density pH sulphates alcohol
## 4746 289.0 440.0 0.99314 3.44 0.64 10.5
## 3308 124.0 208.0 0.99395 2.90 0.38 11.0
## 741 5.0 19.0 0.99150 3.31 0.37 12.6
## 254 5.0 109.0 0.99130 3.53 0.43 11.7
## 446 16.0 66.0 0.99370 3.24 0.40 11.5
## 3811 42.0 162.0 0.99705 3.24 0.52 10.5
## 1230 18.0 96.0 0.99110 3.20 0.32 12.4
## 3410 24.0 111.0 0.99160 3.37 0.43 11.0
## 3088 38.0 201.0 0.99500 3.14 0.44 10.1
## 1689 118.5 216.0 0.99490 3.55 0.63 9.4
## 1932 146.5 307.5 0.99240 3.24 0.37 11.0
## 3737 18.0 88.0 0.99144 3.22 0.79 12.7
## 252 41.0 197.0 0.99800 3.02 0.50 9.8
## 1418 35.5 366.5 1.00010 3.04 0.63 11.0
## 2051 15.0 123.0 0.99970 2.93 0.55 9.7
## 177 19.0 65.0 0.99100 3.03 0.41 12.6
## 527 3.0 29.0 0.99060 3.04 0.53 12.5
## 832 34.0 161.0 0.99350 3.05 0.62 11.5
## 4509 75.0 129.0 0.99020 3.20 0.38 11.5
## 2160 6.0 61.0 0.98970 3.30 0.34 12.2
## 3266 64.0 157.0 0.99688 3.42 0.44 8.0
## 874 5.0 33.0 0.99390 2.89 0.28 9.6
## 3771 15.0 143.0 0.99094 3.03 0.65 12.0
## 2117 19.0 68.0 0.99020 3.14 0.37 11.7
## 4481 55.0 122.0 0.99636 3.10 0.35 12.8
## 2374 5.0 57.0 0.99256 3.05 0.54 10.4
## 3902 4.0 10.0 0.99246 3.32 0.36 13.5
## 541 10.0 175.0 0.99530 3.46 0.55 11.4
## 4805 9.0 65.0 0.98934 3.24 0.35 12.1
## 1485 8.0 134.0 0.99580 3.14 0.50 9.1
## quality StdResid CooksD
## 4746 3 -5.317 0.147
## 3308 3 -4.432 0.019
## 741 3 -4.395 0.006
## 254 3 -4.354 0.005
## 446 3 -4.248 0.003
## 3811 3 -4.174 0.002
## 1230 3 -4.169 0.004
## 3410 3 -3.976 0.001
## 3088 3 -3.882 0.002
## 1689 3 -3.808 0.011
## 1932 3 -3.807 0.019
## 3737 4 -3.650 0.004
## 252 3 -3.647 0.004
## 1418 3 -3.321 0.013
## 2051 3 -3.265 0.008
## 177 4 -3.212 0.002
## 527 4 -3.203 0.002
## 832 4 -3.189 0.001
## 4509 4 -3.169 0.003
## 2160 4 -3.138 0.002
## 3266 3 -3.097 0.004
## 874 3 -3.014 0.005
## 3771 4 -3.012 0.002
## 2117 4 -3.003 0.001
## 4481 5 -2.987 0.009
## 2374 3 -2.922 0.002
## 3902 4 -2.876 0.005
## 541 4 -2.859 0.002
## 4805 4 -2.796 0.002
## 1485 3 -2.741 0.001
#Units with StdResid outside of the [-3:3] interval should be remove
#After concluding that there were over 20 units, we filter directly establishing intervals.
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:Hmisc':
##
## src, summarize
## The following object is masked from 'package:car':
##
## recode
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
mydataWine2 <- mydataWine %>%
filter(StdResid >= -3 & StdResid <= 3)
# difference in rows after the filter
print(nrow(mydataWine))
## [1] 4898
print(nrow(mydataWine2))
## [1] 4852
we lose 46 units
head(mydataWine2[order(mydataWine2$StdResid),],3)
## fixed.acidity volatile.acidity citric.acid residual.sugar chlorides
## 4437 5.9 0.22 0.45 22.6 0.120
## 2345 7.6 0.48 0.37 1.2 0.034
## 3866 4.8 0.65 0.12 1.1 0.013
## free.sulfur.dioxide total.sulfur.dioxide density pH sulphates alcohol
## 4437 55 122 0.99636 3.10 0.35 12.8
## 2345 5 57 0.99256 3.05 0.54 10.4
## 3866 4 10 0.99246 3.32 0.36 13.5
## quality StdResid CooksD
## 4437 5 -2.987 0.009
## 2345 3 -2.922 0.002
## 3866 4 -2.876 0.005
head(mydataWine2[order(-mydataWine2$StdResid),],3)
## fixed.acidity volatile.acidity citric.acid residual.sugar chlorides
## 772 8.1 0.17 0.44 14.1 0.053
## 776 8.1 0.17 0.44 14.1 0.053
## 1202 8.2 0.37 0.36 1.0 0.034
## free.sulfur.dioxide total.sulfur.dioxide density pH sulphates alcohol
## 772 43 145 1.0006 3.28 0.75 8.8
## 776 43 145 1.0006 3.28 0.75 8.8
## 1202 17 93 0.9906 3.04 0.32 11.7
## quality StdResid CooksD
## 772 8 2.983 0.003
## 776 8 2.983 0.003
## 1202 8 2.927 0.001
We can see that all units outside the [-3:3] interval were removed.
head(mydataWine2[order(-mydataWine2$CooksD),],10)
## fixed.acidity volatile.acidity citric.acid residual.sugar chlorides
## 4437 5.9 0.220 0.45 22.6 0.120
## 1020 7.9 0.640 0.46 10.6 0.244
## 3022 6.2 0.255 0.24 1.7 0.039
## 654 6.8 0.290 0.16 1.4 0.038
## 3866 4.8 0.650 0.12 1.1 0.013
## 18 6.2 0.660 0.48 1.2 0.029
## 21 6.2 0.660 0.48 1.2 0.029
## 772 8.1 0.170 0.44 14.1 0.053
## 776 8.1 0.170 0.44 14.1 0.053
## 1278 6.9 0.390 0.22 4.3 0.030
## free.sulfur.dioxide total.sulfur.dioxide density pH sulphates alcohol
## 4437 55.0 122.0 0.99636 3.10 0.35 12.8
## 1020 33.0 227.0 0.99830 2.87 0.74 9.1
## 3022 138.5 272.0 0.99452 3.53 0.53 9.6
## 654 122.5 234.5 0.99220 3.15 0.47 10.0
## 3866 4.0 10.0 0.99246 3.32 0.36 13.5
## 18 29.0 75.0 0.98920 3.33 0.39 12.8
## 21 29.0 75.0 0.98920 3.33 0.39 12.8
## 772 43.0 145.0 1.00060 3.28 0.75 8.8
## 776 43.0 145.0 1.00060 3.28 0.75 8.8
## 1278 10.0 102.0 0.99300 3.00 0.87 11.6
## quality StdResid CooksD
## 4437 5 -2.987 0.009
## 1020 3 -1.984 0.008
## 3022 4 -2.610 0.007
## 654 4 -2.488 0.005
## 3866 4 -2.876 0.005
## 18 8 2.816 0.004
## 21 8 2.816 0.004
## 772 8 2.983 0.003
## 776 8 2.983 0.003
## 1278 4 -2.724 0.003
For the Cook’s distances we can not find high impact units.
fit5 <- lm(quality ~ fixed.acidity + volatile.acidity + citric.acid + residual.sugar + chlorides + free.sulfur.dioxide + total.sulfur.dioxide + pH + sulphates + alcohol, data = mydataWine2)
mydataWine2$StdResid <- round(rstandard(fit5),3)
mydataWine2$CooksD <- round(cooks.distance(fit5),3)
hist(mydataWine2$StdResid,
xlab = "Standardized residuals",
ylab = "Frequency",
main = "Histogram of standardized residuals")
shapiro.test(mydataWine2$StdResid)
##
## Shapiro-Wilk normality test
##
## data: mydataWine2$StdResid
## W = 0.99553, p-value = 4.871e-11
We still reject H0 - erros are not normally distributed. However, the sample is big enough (>30)
hist(mydataWine2$CooksD,
xlab = "Cooks distance",
ylab = "Frequency",
main = "histogram of cooks distances")
WE dont see high impact units.
mydataWine2$StdFitted <- scale(fit5$fitted.values)
library(car)
scatterplot(y = mydataWine2$StdResid, x = mydataWine2$StdFitted,
ylab = "Standardized residuals",
xlab = "standardized Fitted Values ",
boxplots = FALSE,
regLine = FALSE,
smooth = FALSE)
install.packages("olsrr")
## Installing package into 'C:/Users/pedro/AppData/Local/R/win-library/4.3'
## (as 'lib' is unspecified)
## package 'olsrr' successfully unpacked and MD5 sums checked
##
## The downloaded binary packages are in
## C:\Users\pedro\AppData\Local\Temp\RtmpykYBzT\downloaded_packages
library(olsrr)
##
## Attaching package: 'olsrr'
## The following object is masked from 'package:datasets':
##
## rivers
ols_test_breusch_pagan(fit5)
##
## Breusch Pagan Test for Heteroskedasticity
## -----------------------------------------
## Ho: the variance is constant
## Ha: the variance is not constant
##
## Data
## -----------------------------------
## Response : quality
## Variables: fitted values of quality
##
## Test Summary
## -------------------------------
## DF = 1
## Chi2 = 15.47456
## Prob > Chi2 = 8.362319e-05
WE do not reject H0, meaning Homoescedesticity
summary(fit5)
##
## Call:
## lm(formula = quality ~ fixed.acidity + volatile.acidity + citric.acid +
## residual.sugar + chlorides + free.sulfur.dioxide + total.sulfur.dioxide +
## pH + sulphates + alcohol, data = mydataWine2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.25224 -0.49216 -0.03514 0.45776 2.26999
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.7424896 0.3308161 5.267 1.44e-07 ***
## fixed.acidity -0.0453473 0.0142176 -3.190 0.00143 **
## volatile.acidity -1.8576942 0.1079029 -17.216 < 2e-16 ***
## citric.acid 0.0085577 0.0911293 0.094 0.92519
## residual.sugar 0.0238485 0.0024259 9.831 < 2e-16 ***
## chlorides -1.0282639 0.5126297 -2.006 0.04493 *
## free.sulfur.dioxide 0.0059624 0.0008201 7.270 4.17e-13 ***
## total.sulfur.dioxide -0.0009876 0.0003556 -2.777 0.00550 **
## pH 0.2070635 0.0782786 2.645 0.00819 **
## sulphates 0.4165324 0.0922032 4.518 6.40e-06 ***
## alcohol 0.3727677 0.0106959 34.851 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.7135 on 4841 degrees of freedom
## Multiple R-squared: 0.3067, Adjusted R-squared: 0.3053
## F-statistic: 214.2 on 10 and 4841 DF, p-value: < 2.2e-16
##Interoretation of the results os the regression model:
-If the Fixed Acidity increases by 1 unit of measure, the quality of the wine decreases on average 0.05 evaluation points (p=0.002). Assuming all other variables remain the same.
-If the Volatile Acidity increases by 1 unit of measure, the quality of the wine decreases on average 1.86 evaluation points (p<0.001). Assuming all other variables remain the same.
-We can not take conclusions from Citric Acid as it is not statistically significant
-If the Residual Sugar increases by 1 unit of measure, the quality of the wine increases on average 0.02 evaluation points (p<0.001). Assuming all other variables remain the same.
-If the chlorides increases by 1 unit of measure, the quality of the wine decreases on average 1.03 evaluation points (p=0.045). Assuming all other variables remain the same.
-If the Free Sulfur Dioxide increases by 1 unit of measure, the quality of the wine increases on average 0.006 evaluation points (p<0.001). Assuming all other variables remain the same.
-If the Total Sulfur Dioxide increases by 1 unit of measure, the quality of the wine decreases on average 0.001 evaluation points (p=0.006). Assuming all other variables remain the same.
-If the pH increases by 1 unit of measure, the quality of the wine increases on average 0.2 evaluation points (p=0.009). Assuming all other variables remain the same.
-If the Sulphates increases by 1 unit of measure, the quality of the wine increases on average 0.42 evaluation points (p<0.001). Assuming all other variables remain the same.
-If the Alcohol increases by 1 unit of measure, the quality of the wine increases on average 0.37 evaluation points (p<0.001). Assuming all other variables remain the same.
sqrt(summary(fit5)$r.squared)
## [1] 0.553821
-Linear relationship between the quality of wines and all explanatory variables is semi strong.
We this model we can understand how this variables, attributes based on physicochemical tests, effect the quality of white wine.