This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
#load required packages/libraries
library(caTools)
library(ggplot2)
# the nnet package is required for multinomial regression
#install.packages("nnet")
library(nnet)
#install.packages("corrplot")
library(corrplot)
## corrplot 0.84 loaded
#car package is used for using VIF
library(car)
## Loading required package: carData
# Read the red-wine quality data. read.csv() already assumes a header row and
# comma-separated fields, so neither needs to be spelled out.
wine_quality <- read.csv(file = "winequality-red.csv")
dim(wine_quality)
## [1] 1599 12
# The data set has 1599 rows (observations) and 12 columns (variables).
# ********* Exploratory Data Analysis ************
# Inspect the data type of every variable.
str(object = wine_quality)
## 'data.frame': 1599 obs. of 12 variables:
## $ fixed.acidity : num 7.4 7.8 7.8 11.2 7.4 7.4 7.9 7.3 7.8 7.5 ...
## $ volatile.acidity : num 0.7 0.88 0.76 0.28 0.7 0.66 0.6 0.65 0.58 0.5 ...
## $ citric.acid : num 0 0 0.04 0.56 0 0 0.06 0 0.02 0.36 ...
## $ residual.sugar : num 1.9 2.6 2.3 1.9 1.9 1.8 1.6 1.2 2 6.1 ...
## $ chlorides : num 0.076 0.098 0.092 0.075 0.076 0.075 0.069 0.065 0.073 0.071 ...
## $ free.sulfur.dioxide : num 11 25 15 17 11 13 15 15 9 17 ...
## $ total.sulfur.dioxide: num 34 67 54 60 34 40 59 21 18 102 ...
## $ density : num 0.998 0.997 0.997 0.998 0.998 ...
## $ pH : num 3.51 3.2 3.26 3.16 3.51 3.51 3.3 3.39 3.36 3.35 ...
## $ sulphates : num 0.56 0.68 0.65 0.58 0.56 0.56 0.46 0.47 0.57 0.8 ...
## $ alcohol : num 9.4 9.8 9.8 9.8 9.4 9.4 9.4 10 9.5 10.5 ...
## $ quality : int 5 5 5 6 5 5 5 7 7 5 ...
# Check for missing data / null values.
# List every row that has at least one NA. complete.cases() returns one logical
# per row, so negating it selects exactly the incomplete rows. (The previous
# form negated the integer indices from which(), which is FALSE for every
# non-zero index and therefore always selected zero rows, even if NAs existed.)
wine_quality[!complete.cases(wine_quality), ]
## [1] fixed.acidity volatile.acidity citric.acid
## [4] residual.sugar chlorides free.sulfur.dioxide
## [7] total.sulfur.dioxide density pH
## [10] sulphates alcohol quality
## <0 rows> (or 0-length row.names)
# 0 rows are returned, i.e. all 1599 rows are complete and no row has a missing value
# Pairwise Pearson correlations between all columns (quality included).
cor(x = wine_quality, method = "pearson")
## fixed.acidity volatile.acidity citric.acid
## fixed.acidity 1.00000000 -0.256130895 0.67170343
## volatile.acidity -0.25613089 1.000000000 -0.55249568
## citric.acid 0.67170343 -0.552495685 1.00000000
## residual.sugar 0.11477672 0.001917882 0.14357716
## chlorides 0.09370519 0.061297772 0.20382291
## free.sulfur.dioxide -0.15379419 -0.010503827 -0.06097813
## total.sulfur.dioxide -0.11318144 0.076470005 0.03553302
## density 0.66804729 0.022026232 0.36494718
## pH -0.68297819 0.234937294 -0.54190414
## sulphates 0.18300566 -0.260986685 0.31277004
## alcohol -0.06166827 -0.202288027 0.10990325
## quality 0.12405165 -0.390557780 0.22637251
## residual.sugar chlorides free.sulfur.dioxide
## fixed.acidity 0.114776724 0.093705186 -0.153794193
## volatile.acidity 0.001917882 0.061297772 -0.010503827
## citric.acid 0.143577162 0.203822914 -0.060978129
## residual.sugar 1.000000000 0.055609535 0.187048995
## chlorides 0.055609535 1.000000000 0.005562147
## free.sulfur.dioxide 0.187048995 0.005562147 1.000000000
## total.sulfur.dioxide 0.203027882 0.047400468 0.667666450
## density 0.355283371 0.200632327 -0.021945831
## pH -0.085652422 -0.265026131 0.070377499
## sulphates 0.005527121 0.371260481 0.051657572
## alcohol 0.042075437 -0.221140545 -0.069408354
## quality 0.013731637 -0.128906560 -0.050656057
## total.sulfur.dioxide density pH
## fixed.acidity -0.11318144 0.66804729 -0.68297819
## volatile.acidity 0.07647000 0.02202623 0.23493729
## citric.acid 0.03553302 0.36494718 -0.54190414
## residual.sugar 0.20302788 0.35528337 -0.08565242
## chlorides 0.04740047 0.20063233 -0.26502613
## free.sulfur.dioxide 0.66766645 -0.02194583 0.07037750
## total.sulfur.dioxide 1.00000000 0.07126948 -0.06649456
## density 0.07126948 1.00000000 -0.34169933
## pH -0.06649456 -0.34169933 1.00000000
## sulphates 0.04294684 0.14850641 -0.19664760
## alcohol -0.20565394 -0.49617977 0.20563251
## quality -0.18510029 -0.17491923 -0.05773139
## sulphates alcohol quality
## fixed.acidity 0.183005664 -0.06166827 0.12405165
## volatile.acidity -0.260986685 -0.20228803 -0.39055778
## citric.acid 0.312770044 0.10990325 0.22637251
## residual.sugar 0.005527121 0.04207544 0.01373164
## chlorides 0.371260481 -0.22114054 -0.12890656
## free.sulfur.dioxide 0.051657572 -0.06940835 -0.05065606
## total.sulfur.dioxide 0.042946836 -0.20565394 -0.18510029
## density 0.148506412 -0.49617977 -0.17491923
## pH -0.196647602 0.20563251 -0.05773139
## sulphates 1.000000000 0.09359475 0.25139708
## alcohol 0.093594750 1.00000000 0.47616632
## quality 0.251397079 0.47616632 1.00000000
# Visualise the lower triangle of the correlation matrix.
corrplot(cor(wine_quality), type = "lower")
# Multiple histograms
# (disabled draft)
# hist(wine_quality[,1], xlim=c(0, 3500), breaks=seq(0, 3500, 100), main=colnames[i], probability=TRUE, col="gray", border="white")
# NOTE(review): this variable shares its name with base::colnames(); the
# colnames(...) calls below still resolve to the function.
colnames <- names(wine_quality)
# One plot per page.
par(mfrow = c(1, 1))
# Density-scaled histograms of the first two columns. The data expression is
# kept as wine_quality[, i] because hist() uses it as the default x-axis label.
for (i in seq_len(2)) {
  hist(
    wine_quality[, i],
    main = colnames(wine_quality)[i],
    probability = TRUE,
    col = "orange",
    border = "white"
  )
}
# Work on a copy so the raw data stays available for the plots and tests below.
tranformed_wine_quality <- wine_quality
# NOTE(review): alcohol is log-transformed twice - confirm this is intended.
tranformed_wine_quality[["alcohol"]] <-
  log(log(tranformed_wine_quality[["alcohol"]]))
# Log-transform both sulfur dioxide measurements.
for (so2_col in c("free.sulfur.dioxide", "total.sulfur.dioxide")) {
  tranformed_wine_quality[[so2_col]] <- log(tranformed_wine_quality[[so2_col]])
}
# Reproducible 80/20 split on the quality column.
# NOTE(review): `split` shares its name with base::split().
set.seed(1234)
split <- sample.split(wine_quality$quality, SplitRatio = 0.8)
training_set <- tranformed_wine_quality[split, ]
test_set <- tranformed_wine_quality[!split, ]
# Baseline model: ordinary least squares of quality on every other column.
model1 <- lm(
  formula = quality ~ .,
  data = training_set
)
summary_model1 <- summary(model1)
print(summary_model1)
##
## Call:
## lm(formula = quality ~ ., data = training_set)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.32368 -0.36012 -0.07567 0.46699 2.11979
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 28.75778 24.16304 1.190 0.234208
## fixed.acidity 0.05035 0.02914 1.728 0.084240 .
## volatile.acidity -1.10739 0.13369 -8.283 3.02e-16 ***
## citric.acid -0.21544 0.16165 -1.333 0.182854
## residual.sugar 0.01758 0.01673 1.051 0.293453
## chlorides -1.53106 0.54256 -2.822 0.004849 **
## free.sulfur.dioxide 0.12564 0.04419 2.843 0.004535 **
## total.sulfur.dioxide -0.15365 0.04485 -3.426 0.000633 ***
## density -27.89910 24.41961 -1.142 0.253467
## pH -0.25230 0.21223 -1.189 0.234733
## sulphates 0.89879 0.13048 6.888 8.89e-12 ***
## alcohol 6.45504 0.75699 8.527 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.6558 on 1266 degrees of freedom
## Multiple R-squared: 0.344, Adjusted R-squared: 0.3383
## F-statistic: 60.35 on 11 and 1266 DF, p-value: < 2.2e-16
# Predict quality for the held-out rows with the baseline model.
y_pred <- predict(object = model1, newdata = test_set)
y_actual <- test_set[["quality"]]
# Training-set mean squared error.
train_MSE <- mean(residuals(model1)^2)
train_MSE
## [1] 0.4260819
# Test-set mean squared prediction error.
MSPE <- mean((y_actual - y_pred)^2)
MSPE
## [1] 0.3921996
# Distribution of the predictions within each observed quality score.
df <- as.data.frame(cbind(y_actual, y_pred))
library(ggplot2)
ggplot(data = df, mapping = aes(x = as.factor(y_actual), y = y_pred)) +
  geom_boxplot()
# Variable selection by backward elimination starts from the full model
# (same specification as model1).
model2 <- lm(
  formula = quality ~ .,
  data = training_set
)
summary_model2 <- summary(model2)
print(summary_model2)
##
## Call:
## lm(formula = quality ~ ., data = training_set)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.32368 -0.36012 -0.07567 0.46699 2.11979
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 28.75778 24.16304 1.190 0.234208
## fixed.acidity 0.05035 0.02914 1.728 0.084240 .
## volatile.acidity -1.10739 0.13369 -8.283 3.02e-16 ***
## citric.acid -0.21544 0.16165 -1.333 0.182854
## residual.sugar 0.01758 0.01673 1.051 0.293453
## chlorides -1.53106 0.54256 -2.822 0.004849 **
## free.sulfur.dioxide 0.12564 0.04419 2.843 0.004535 **
## total.sulfur.dioxide -0.15365 0.04485 -3.426 0.000633 ***
## density -27.89910 24.41961 -1.142 0.253467
## pH -0.25230 0.21223 -1.189 0.234733
## sulphates 0.89879 0.13048 6.888 8.89e-12 ***
## alcohol 6.45504 0.75699 8.527 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.6558 on 1266 degrees of freedom
## Multiple R-squared: 0.344, Adjusted R-squared: 0.3383
## F-statistic: 60.35 on 11 and 1266 DF, p-value: < 2.2e-16
# Backward selection: step() drops terms from the full model; per-step tracing
# is switched off and only the selected model is printed.
print(step(object = model2, direction = "backward", trace = FALSE))
##
## Call:
## lm(formula = quality ~ volatile.acidity + chlorides + free.sulfur.dioxide +
## total.sulfur.dioxide + pH + sulphates + alcohol, data = training_set)
##
## Coefficients:
## (Intercept) volatile.acidity chlorides
## 1.6635 -1.0370 -1.7453
## free.sulfur.dioxide total.sulfur.dioxide pH
## 0.1382 -0.1732 -0.4685
## sulphates alcohol
## 0.8664 6.9831
#
# Forward selection: start from the intercept-only model and let step() add
# terms, using the full-model formula as the scope.
# NOTE(review): `formula` shares its name with stats::formula().
min.model <- lm(formula = quality ~ 1, data = training_set)
formula <- formula(lm(formula = quality ~ ., data = training_set))
fwd.model <- step(object = min.model, scope = formula, direction = "forward")
## Start: AIC=-549.49
## quality ~ 1
##
## Df Sum of Sq RSS AIC
## + alcohol 1 176.568 653.51 -853.14
## + volatile.acidity 1 124.551 705.53 -755.26
## + sulphates 1 61.663 768.42 -646.14
## + citric.acid 1 46.230 783.85 -620.73
## + density 1 20.577 809.50 -579.57
## + total.sulfur.dioxide 1 18.190 811.89 -575.81
## + fixed.acidity 1 14.558 815.52 -570.11
## + chlorides 1 13.155 816.93 -567.91
## + pH 1 3.455 826.63 -552.82
## <none> 830.08 -549.49
## + free.sulfur.dioxide 1 0.422 829.66 -548.14
## + residual.sugar 1 0.251 829.83 -547.88
##
## Step: AIC=-853.14
## quality ~ alcohol
##
## Df Sum of Sq RSS AIC
## + volatile.acidity 1 75.259 578.25 -1007.50
## + sulphates 1 39.091 614.42 -929.97
## + citric.acid 1 27.306 626.21 -905.69
## + pH 1 23.036 630.48 -897.00
## + fixed.acidity 1 20.741 632.77 -892.36
## + density 1 4.334 649.18 -859.64
## + total.sulfur.dioxide 1 1.609 651.90 -854.29
## <none> 653.51 -853.14
## + chlorides 1 0.157 653.36 -851.45
## + free.sulfur.dioxide 1 0.064 653.45 -851.26
## + residual.sugar 1 0.014 653.50 -851.17
##
## Step: AIC=-1007.5
## quality ~ alcohol + volatile.acidity
##
## Df Sum of Sq RSS AIC
## + sulphates 1 17.1337 561.12 -1043.9
## + fixed.acidity 1 5.4840 572.77 -1017.7
## + pH 1 5.4751 572.78 -1017.7
## + density 1 1.6162 576.64 -1009.1
## + total.sulfur.dioxide 1 1.5266 576.73 -1008.9
## <none> 578.25 -1007.5
## + citric.acid 1 0.4164 577.84 -1006.4
## + chlorides 1 0.0649 578.19 -1005.6
## + free.sulfur.dioxide 1 0.0145 578.24 -1005.5
## + residual.sugar 1 0.0000 578.25 -1005.5
##
## Step: AIC=-1043.94
## quality ~ alcohol + volatile.acidity + sulphates
##
## Df Sum of Sq RSS AIC
## + chlorides 1 3.3055 557.82 -1049.5
## + pH 1 3.2298 557.89 -1049.3
## + fixed.acidity 1 3.1696 557.95 -1049.2
## + total.sulfur.dioxide 1 2.8727 558.25 -1048.5
## <none> 561.12 -1043.9
## + density 1 0.1386 560.98 -1042.3
## + free.sulfur.dioxide 1 0.0287 561.09 -1042.0
## + citric.acid 1 0.0129 561.11 -1042.0
## + residual.sugar 1 0.0002 561.12 -1041.9
##
## Step: AIC=-1049.49
## quality ~ alcohol + volatile.acidity + sulphates + chlorides
##
## Df Sum of Sq RSS AIC
## + pH 1 4.6363 553.18 -1058.2
## + fixed.acidity 1 3.6451 554.17 -1055.9
## + total.sulfur.dioxide 1 2.9941 554.82 -1054.4
## <none> 557.82 -1049.5
## + density 1 0.2467 557.57 -1048.1
## + citric.acid 1 0.0784 557.74 -1047.7
## + free.sulfur.dioxide 1 0.0516 557.76 -1047.6
## + residual.sugar 1 0.0276 557.79 -1047.5
##
## Step: AIC=-1058.16
## quality ~ alcohol + volatile.acidity + sulphates + chlorides +
## pH
##
## Df Sum of Sq RSS AIC
## + total.sulfur.dioxide 1 2.61560 550.56 -1062.2
## <none> 553.18 -1058.2
## + citric.acid 1 0.72663 552.45 -1057.8
## + fixed.acidity 1 0.44308 552.74 -1057.2
## + free.sulfur.dioxide 1 0.00168 553.18 -1056.2
## + density 1 0.00071 553.18 -1056.2
## + residual.sugar 1 0.00044 553.18 -1056.2
##
## Step: AIC=-1062.22
## quality ~ alcohol + volatile.acidity + sulphates + chlorides +
## pH + total.sulfur.dioxide
##
## Df Sum of Sq RSS AIC
## + free.sulfur.dioxide 1 4.3778 546.19 -1070.4
## <none> 550.56 -1062.2
## + citric.acid 1 0.5565 550.01 -1061.5
## + fixed.acidity 1 0.1770 550.39 -1060.6
## + residual.sugar 1 0.0738 550.49 -1060.4
## + density 1 0.0028 550.56 -1060.2
##
## Step: AIC=-1070.42
## quality ~ alcohol + volatile.acidity + sulphates + chlorides +
## pH + total.sulfur.dioxide + free.sulfur.dioxide
##
## Df Sum of Sq RSS AIC
## <none> 546.19 -1070.4
## + fixed.acidity 1 0.289937 545.90 -1069.1
## + citric.acid 1 0.175559 546.01 -1068.8
## + residual.sugar 1 0.088065 546.10 -1068.6
## + density 1 0.025579 546.16 -1068.5
#results
#quality ~ alcohol + volatile.acidity + sulphates + total.sulfur.dioxide + chlorides + pH + free.sulfur.dioxide
#
#################
# Refit using the seven predictors listed in the selection results above.
model3 <- lm(
  formula = quality ~ volatile.acidity + chlorides + free.sulfur.dioxide +
    total.sulfur.dioxide + pH + sulphates + alcohol,
  data = training_set
)
summary_model3 <- summary(model3)
print(summary_model3)
##
## Call:
## lm(formula = quality ~ volatile.acidity + chlorides + free.sulfur.dioxide +
## total.sulfur.dioxide + pH + sulphates + alcohol, data = training_set)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.24794 -0.37109 -0.06992 0.47185 2.12950
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.66354 0.56919 2.923 0.003532 **
## volatile.acidity -1.03698 0.11354 -9.133 < 2e-16 ***
## chlorides -1.74533 0.52507 -3.324 0.000913 ***
## free.sulfur.dioxide 0.13818 0.04331 3.190 0.001455 **
## total.sulfur.dioxide -0.17320 0.04296 -4.032 5.86e-05 ***
## pH -0.46853 0.13231 -3.541 0.000413 ***
## sulphates 0.86638 0.12565 6.895 8.45e-12 ***
## alcohol 6.98311 0.49435 14.126 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.6558 on 1270 degrees of freedom
## Multiple R-squared: 0.342, Adjusted R-squared: 0.3384
## F-statistic: 94.3 on 7 and 1270 DF, p-value: < 2.2e-16
# Variance inflation factors for the selected predictors.
car::vif(model3)
## volatile.acidity chlorides free.sulfur.dioxide
## 1.232230 1.251374 2.638836
## total.sulfur.dioxide pH sulphates
## 2.752512 1.232092 1.262575
## alcohol
## 1.287294
# free.sulfur.dioxide (2.64) and total.sulfur.dioxide (2.75) have the largest
# VIFs; all other predictors are below 1.3. None of the values is large, so
# multicollinearity does not appear to be a problem for this model.
# Predict on the held-out rows.
y_pred3 <- predict(object = model3, newdata = test_set)
y_actual3 <- test_set[["quality"]]
# Training MSE and test-set mean squared prediction error (stored, not printed).
train_MSE3 <- mean(residuals(model3)^2)
MSPE3 <- mean((y_actual3 - y_pred3)^2)
# Second set of models: the dependent variable is now treated as an ordinal
# (categorical) variable, so the linear regression approach no longer applies.
# List the distinct quality scores present in the data.
levels(as.factor(wine_quality[["quality"]]))
## [1] "3" "4" "5" "6" "7" "8"
# As demonstrated above, the categorical variable quality has 6 different levels (3-8).
# Binary logistic regression cannot separate all 6 classes at once, because it is designed for two-class problems.
# Hence we employ a
# multinomial logistic regression model.
# Treat quality as a categorical response for the multinomial model.
training_set$quality <- as.factor(training_set$quality)
test_set$quality <- as.factor(test_set$quality)
# Standardise the predictors. The response is excluded by name rather than by
# column position.
predictor_cols <- setdiff(names(training_set), "quality")
scaled_train <- scale(training_set[predictor_cols])
training_set[predictor_cols] <- scaled_train
# Apply the TRAINING means and standard deviations to the test set. Scaling the
# test set with its own statistics would put the two sets on different scales
# and leak test-set information into the preprocessing.
# NOTE: the training-set scaling is unchanged, so the fit output recorded below
# still applies; test-set predictions recorded further down were produced with
# the earlier per-set scaling and may differ slightly after this change.
test_set[predictor_cols] <- scale(
  test_set[predictor_cols],
  center = attr(scaled_train, "scaled:center"),
  scale = attr(scaled_train, "scaled:scale")
)
# Fit the multinomial logistic regression on all predictors.
glm.fit <- multinom(quality ~ ., data = training_set)
## # weights: 78 (60 variable)
## initial value 2289.868602
## iter 10 value 1627.470969
## iter 20 value 1265.644460
## iter 30 value 1200.448243
## iter 40 value 1173.063818
## iter 50 value 1165.296391
## iter 60 value 1164.000887
## iter 70 value 1163.949313
## final value 1163.948594
## converged
summary(glm.fit)
## Call:
## multinom(formula = quality ~ ., data = training_set)
##
## Coefficients:
## (Intercept) fixed.acidity volatile.acidity citric.acid residual.sugar
## 4 6.409809 2.7804077 -1.962018 -2.0060549 0.52195799
## 5 9.640708 1.1679273 -2.358993 -1.8293426 -0.16062131
## 6 9.784926 1.7279007 -2.794595 -2.1042224 -0.04392396
## 7 7.556765 2.1228349 -3.216412 -2.1038674 0.28582991
## 8 3.876369 0.5254109 -2.381461 -0.8180732 -0.52986416
## chlorides free.sulfur.dioxide total.sulfur.dioxide density pH
## 4 0.2315770 -0.8875621 2.022564 -3.450494 0.1357967
## 5 0.2229732 -0.4098571 2.362025 -2.212443 -1.1050954
## 6 0.1688509 -0.1782503 1.925406 -2.525190 -0.9280915
## 7 -0.2995520 0.2370099 1.280785 -2.664251 -0.7888335
## 8 -1.1164822 -0.4406022 1.483864 -2.291996 -1.9452987
## sulphates alcohol
## 4 -0.1287139 1.176349
## 5 0.2459895 1.506625
## 6 0.6102799 2.171551
## 7 1.0508378 2.844505
## 8 1.1513499 3.516117
##
## Std. Errors:
## (Intercept) fixed.acidity volatile.acidity citric.acid residual.sugar
## 4 2.040605 1.592743 0.6835868 0.9033522 0.4649719
## 5 2.033815 1.545408 0.6796741 0.8656888 0.4429749
## 6 2.033773 1.548275 0.6833376 0.8690179 0.4455230
## 7 2.040150 1.562479 0.7000762 0.8845010 0.4556881
## 8 2.183888 1.745202 0.8051351 1.0136004 0.6728811
## chlorides free.sulfur.dioxide total.sulfur.dioxide density pH
## 4 0.4938235 0.9811067 1.032053 1.379101 0.9984517
## 5 0.4488041 0.9635862 1.010962 1.341338 0.9801278
## 6 0.4522850 0.9656934 1.013241 1.344430 0.9820645
## 7 0.4944618 0.9797889 1.029682 1.360104 0.9935244
## 8 0.7970266 1.1121433 1.163382 1.512250 1.1187153
## sulphates alcohol
## 4 0.9041114 0.9645808
## 5 0.8617125 0.9312921
## 6 0.8630185 0.9344517
## 7 0.8679310 0.9467140
## 8 0.8940197 1.0827069
##
## Residual Deviance: 2327.897
## AIC: 2447.897
# Prediction: class probabilities for every test row (one column per class).
y_pred_multinom <- predict(glm.fit, test_set, "probs")
a1 <- data.frame(y_pred_multinom)
# Label the columns with the class names reported by the model instead of a
# hard-coded 3:8, so the labels stay correct if a class is absent from training.
colnames(a1) <- colnames(y_pred_multinom)
# Fitted probabilities on the training data (not used later in this script).
b1 <- fitted(glm.fit)
# Most probable class per row; which.max() keeps the first maximum on ties.
# Vectorised over rows, so it also copes with an empty test set, unlike a
# 1:nrow() loop.
c1 <- data.frame(
  V1 = unname(colnames(a1)[apply(y_pred_multinom, 1L, which.max)]),
  stringsAsFactors = FALSE
)
table(c1)
## c1
## 3 4 5 6 7
## 1 1 154 148 17
# Confusion matrix: rows are observed classes, columns are predicted classes.
cm1 <- table(test_set$quality, c1[, 1])
cm1
##
## 3 4 5 6 7
## 3 0 0 1 1 0
## 4 0 0 8 3 0
## 5 1 1 104 29 1
## 6 0 0 38 86 4
## 7 0 0 3 27 10
## 8 0 0 0 2 2
# Accuracy = share of rows whose predicted label equals the observed label.
# Comparing labels directly avoids summing hard-coded cells of cm1, which is not
# square (no row is predicted as class 8) and whose "diagonal" only lines up
# when the predicted classes happen to start at the first observed class.
accuracy1 <- mean(as.character(test_set$quality) == as.character(c1[, 1]))
accuracy1
## [1] 0.623053
# Observed and predicted classes side by side, then inspect the structure.
t1111 <- as.data.frame(cbind(test_set[["quality"]], c1[[1]]))
str(object = t1111)
## 'data.frame': 321 obs. of 2 variables:
## $ V1: Factor w/ 6 levels "1","2","3","4",..: 3 3 3 4 4 3 2 4 3 3 ...
## $ V2: Factor w/ 5 levels "3","4","5","6",..: 3 3 3 3 3 3 3 4 3 3 ...
library(caret)
## Loading required package: lattice
# Zero column used to pad the confusion matrix.
x1 <- numeric(6)
print(cm1)
##
## 3 4 5 6 7
## 3 0 0 1 1 0
## 4 0 0 8 3 0
## 5 1 1 104 29 1
## 6 0 0 38 86 4
## 7 0 0 3 27 10
## 8 0 0 0 2 2
# Confusion matrix with the zero column appended, stored as a data frame.
t1111 <- data.frame(cbind(cm1, x1))
# (disabled drafts)
# ggplot(data = t1111, aes(x=V1, y=V2, fill=value)) + geom_tile()
# plot(confusionMatrix$t1111)
# Jittered scatter of predicted against observed class on the test set.
# NOTE(review): qplot() is deprecated in recent ggplot2 releases - consider
# porting this to ggplot() + geom_jitter().
qplot(
  quality, c1[,1],
  data = test_set,
  colour = quality,
  geom = c("jitter", "abline"),
  main = "predicted vs. observed in validation data",
  xlab = "Observed Classe",
  ylab = "Predicted Classe"
)
# Here we achieve an accuracy of 62.31 % (accuracy1 above) with a classification model over 6 categories of the dependent variable.
# This classification accuracy is much better than randomly classifying the data into 6 classes, which would give an accuracy
# of about 100% / 6 = 16.67 %.
# Second multinomial logit model, restricted to selected variables.
# Reload the raw data used by the plots and the hypothesis test below; this does
# not affect training_set or test_set.
wine_quality <- read.csv("winequality-red.csv", header = TRUE, sep = ",")
# training_set and test_set already hold a factor response and standardised
# predictors from the first multinomial model, so they are not converted or
# scaled again here. Repeating scale() was a no-op for the training set and
# would re-centre the test set on its own statistics.
multinom_model2 <- multinom(
  quality ~ fixed.acidity + volatile.acidity + residual.sugar +
    free.sulfur.dioxide + density + pH + sulphates + alcohol,
  data = training_set
)
## # weights: 60 (45 variable)
## initial value 2289.868602
## iter 10 value 1495.391682
## iter 20 value 1262.920575
## iter 30 value 1215.052357
## iter 40 value 1197.663219
## iter 50 value 1196.101548
## iter 60 value 1196.039980
## final value 1196.039912
## converged
#summary(multinom_model2)
# Prediction: class probabilities for every test row (one column per class).
multinom_prob2 <- predict(multinom_model2, test_set, "probs")
df_multinom_prob2 <- data.frame(multinom_prob2)
# Label the columns with the class names reported by the model instead of a
# hard-coded 3:8, so the labels stay correct if a class is absent from training.
colnames(df_multinom_prob2) <- colnames(multinom_prob2)
# Most probable class per row; which.max() keeps the first maximum on ties.
# Vectorised over rows, so it also copes with an empty test set, unlike a
# 1:nrow() loop.
df_class_2 <- data.frame(
  V1 = unname(
    colnames(df_multinom_prob2)[apply(multinom_prob2, 1L, which.max)]
  ),
  stringsAsFactors = FALSE
)
table(df_class_2)
## df_class_2
## 4 5 6 7
## 1 164 141 15
# Confusion matrix: rows are observed classes, columns are predicted classes.
cm2 <- table(test_set$quality, df_class_2[, 1])
cm2
##
## 4 5 6 7
## 3 0 1 1 0
## 4 0 8 3 0
## 5 1 106 29 0
## 6 0 47 78 3
## 7 0 2 28 10
## 8 0 0 2 2
# Accuracy = share of rows whose predicted label equals the observed label.
# Comparing labels directly replaces the hard-coded offset cells
# (cm2[2,1] + cm2[3,2] + ...), which are only correct for this exact set of
# predicted classes.
accuracy2 <- mean(
  as.character(test_set$quality) == as.character(df_class_2[, 1])
)
accuracy2
## [1] 0.6043614
# Here the accuracy is 60.44 % (accuracy2 above).
# Using only the selected variables does not improve the classification accuracy; it is slightly lower than for the full model.
# Plots for visualisation.
# One density-scaled histogram per column on a three-column grid. The column
# count and the number of grid rows are derived from the data instead of being
# hard-coded as 12 and 4.
n_cols <- ncol(wine_quality)
par(mfrow = c(ceiling(n_cols / 3), 3))
for (i in seq_len(n_cols)) {
  hist(
    wine_quality[, i],
    main = colnames(wine_quality)[i],
    probability = TRUE,
    col = "orange",
    border = "white"
  )
}
# Back to one plot per page for the individual histograms.
par(mfrow = c(1, 1))
for (i in seq_len(2)) {
  hist(
    wine_quality[, i],
    main = colnames(wine_quality)[i],
    probability = TRUE,
    col = "orange",
    border = "white"
  )
}
colnames(wine_quality)[11]
## [1] "alcohol"
hist(
  wine_quality[, 11],
  main = colnames(wine_quality)[11],
  probability = TRUE,
  col = "orange",
  border = "white"
)
# Jittered scatter of predicted against observed class on the test set.
# NOTE(review): qplot() is deprecated in recent ggplot2 releases - consider
# porting this to ggplot() + geom_jitter().
qplot(
  quality, c1[,1],
  data = test_set,
  colour = quality,
  geom = c("jitter", "abline"),
  size = 4,
  main = "predicted vs. observed in validation data",
  xlab = "Observed Classe",
  ylab = "Predicted Classe"
)
# Hypothesis test
# Null hypothesis: mean alcohol content is the same for lower-quality
# (quality <= 5) and higher-quality (quality > 5) wines.
# Alternative hypothesis: mean alcohol content differs between the two groups.
x1 <- with(wine_quality, alcohol[quality <= 5])
x2 <- with(wine_quality, alcohol[quality > 5])
# Welch two-sample t-test. The names x1 and x2 are kept because they appear in
# the printed "data:" line.
t.test(x1, x2)
##
## Welch Two Sample t-test
##
## data: x1 and x2
## t = -19.782, df = 1516.8, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -1.020622 -0.836479
## sample estimates:
## mean of x mean of y
## 9.926478 10.855029
# Test result
# The p-value is below the significance threshold, so we can reject the null hypothesis. Mean alcohol content differs between lower- and higher-quality wines, i.e. alcohol content is associated with wine quality.
You can also embed plots, for example: