R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button, a document will be generated that includes both the content and the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

#load required packages/libraries
library(caTools)
library(ggplot2)
#nnet package is required for multinomial regression
#install.packages("nnet")
library(nnet)
#install.packages("corrplot")
library(corrplot)
## corrplot 0.84 loaded
#car package is used for using VIF
library(car)
## Loading required package: carData
# Read the red-wine quality data (comma-separated file with a header row)
wine_quality <- read.csv(file = "winequality-red.csv", header = TRUE, sep = ",")
# Show the dimensions of the data set (rows, columns)
dim(wine_quality)
## [1] 1599   12
#so there are 1599 rows in the data and there are 12 columns (variables)
#*********Exploratory Data Analysis************

#analysing data type of all the variables
str(wine_quality)
## 'data.frame':    1599 obs. of  12 variables:
##  $ fixed.acidity       : num  7.4 7.8 7.8 11.2 7.4 7.4 7.9 7.3 7.8 7.5 ...
##  $ volatile.acidity    : num  0.7 0.88 0.76 0.28 0.7 0.66 0.6 0.65 0.58 0.5 ...
##  $ citric.acid         : num  0 0 0.04 0.56 0 0 0.06 0 0.02 0.36 ...
##  $ residual.sugar      : num  1.9 2.6 2.3 1.9 1.9 1.8 1.6 1.2 2 6.1 ...
##  $ chlorides           : num  0.076 0.098 0.092 0.075 0.076 0.075 0.069 0.065 0.073 0.071 ...
##  $ free.sulfur.dioxide : num  11 25 15 17 11 13 15 15 9 17 ...
##  $ total.sulfur.dioxide: num  34 67 54 60 34 40 59 21 18 102 ...
##  $ density             : num  0.998 0.997 0.997 0.998 0.998 ...
##  $ pH                  : num  3.51 3.2 3.26 3.16 3.51 3.51 3.3 3.39 3.36 3.35 ...
##  $ sulphates           : num  0.56 0.68 0.65 0.58 0.56 0.56 0.46 0.47 0.57 0.8 ...
##  $ alcohol             : num  9.4 9.8 9.8 9.8 9.4 9.4 9.4 10 9.5 10.5 ...
##  $ quality             : int  5 5 5 6 5 5 5 7 7 5 ...
#Checking for Missing data/Null Values

# List every row that contains at least one missing value (NA).
# complete.cases() is TRUE for rows without any NA, so negating it selects
# the incomplete rows. Negating which(...) instead would turn the integer row
# positions into all-FALSE and select nothing, whatever the data contain.
wine_quality[!complete.cases(wine_quality), ]
##  [1] fixed.acidity        volatile.acidity     citric.acid         
##  [4] residual.sugar       chlorides            free.sulfur.dioxide 
##  [7] total.sulfur.dioxide density              pH                  
## [10] sulphates            alcohol              quality             
## <0 rows> (or 0-length row.names)
# 0 rows are returned: all 1599 rows are complete, i.e. none has a missing value



cor(wine_quality)
##                      fixed.acidity volatile.acidity citric.acid
## fixed.acidity           1.00000000     -0.256130895  0.67170343
## volatile.acidity       -0.25613089      1.000000000 -0.55249568
## citric.acid             0.67170343     -0.552495685  1.00000000
## residual.sugar          0.11477672      0.001917882  0.14357716
## chlorides               0.09370519      0.061297772  0.20382291
## free.sulfur.dioxide    -0.15379419     -0.010503827 -0.06097813
## total.sulfur.dioxide   -0.11318144      0.076470005  0.03553302
## density                 0.66804729      0.022026232  0.36494718
## pH                     -0.68297819      0.234937294 -0.54190414
## sulphates               0.18300566     -0.260986685  0.31277004
## alcohol                -0.06166827     -0.202288027  0.10990325
## quality                 0.12405165     -0.390557780  0.22637251
##                      residual.sugar    chlorides free.sulfur.dioxide
## fixed.acidity           0.114776724  0.093705186        -0.153794193
## volatile.acidity        0.001917882  0.061297772        -0.010503827
## citric.acid             0.143577162  0.203822914        -0.060978129
## residual.sugar          1.000000000  0.055609535         0.187048995
## chlorides               0.055609535  1.000000000         0.005562147
## free.sulfur.dioxide     0.187048995  0.005562147         1.000000000
## total.sulfur.dioxide    0.203027882  0.047400468         0.667666450
## density                 0.355283371  0.200632327        -0.021945831
## pH                     -0.085652422 -0.265026131         0.070377499
## sulphates               0.005527121  0.371260481         0.051657572
## alcohol                 0.042075437 -0.221140545        -0.069408354
## quality                 0.013731637 -0.128906560        -0.050656057
##                      total.sulfur.dioxide     density          pH
## fixed.acidity                 -0.11318144  0.66804729 -0.68297819
## volatile.acidity               0.07647000  0.02202623  0.23493729
## citric.acid                    0.03553302  0.36494718 -0.54190414
## residual.sugar                 0.20302788  0.35528337 -0.08565242
## chlorides                      0.04740047  0.20063233 -0.26502613
## free.sulfur.dioxide            0.66766645 -0.02194583  0.07037750
## total.sulfur.dioxide           1.00000000  0.07126948 -0.06649456
## density                        0.07126948  1.00000000 -0.34169933
## pH                            -0.06649456 -0.34169933  1.00000000
## sulphates                      0.04294684  0.14850641 -0.19664760
## alcohol                       -0.20565394 -0.49617977  0.20563251
## quality                       -0.18510029 -0.17491923 -0.05773139
##                         sulphates     alcohol     quality
## fixed.acidity         0.183005664 -0.06166827  0.12405165
## volatile.acidity     -0.260986685 -0.20228803 -0.39055778
## citric.acid           0.312770044  0.10990325  0.22637251
## residual.sugar        0.005527121  0.04207544  0.01373164
## chlorides             0.371260481 -0.22114054 -0.12890656
## free.sulfur.dioxide   0.051657572 -0.06940835 -0.05065606
## total.sulfur.dioxide  0.042946836 -0.20565394 -0.18510029
## density               0.148506412 -0.49617977 -0.17491923
## pH                   -0.196647602  0.20563251 -0.05773139
## sulphates             1.000000000  0.09359475  0.25139708
## alcohol               0.093594750  1.00000000  0.47616632
## quality               0.251397079  0.47616632  1.00000000
# Correlation plot of all variables (lower triangle only)
corrplot(cor(wine_quality), type = "lower")

# Multiple histograms

# Earlier variant, kept for reference:
# hist(wine_quality[,1], xlim=c(0, 3500), breaks=seq(0, 3500, 100), main=colnames[i], probability=TRUE, col="gray", border="white")
# Column names of the data set. This variable shares its name with
# base::colnames(); calls such as colnames(x) still resolve to the function.
colnames <- names(wine_quality)

# One plot per page, then a density-scaled histogram of the first two columns
par(mfrow = c(1, 1))
for (i in seq_len(2)) {
  hist(
    wine_quality[, i],
    main = colnames(wine_quality)[i],
    probability = TRUE,
    col = "orange",
    border = "white"
  )
}

# Build a transformed copy of the data; the raw wine_quality stays untouched.
# NOTE(review): alcohol is log-transformed twice (log of log), unlike the two
# sulfur dioxide columns, which get a single log -- confirm this is intended.
tranformed_wine_quality <- transform(
  wine_quality,
  alcohol = log(log(alcohol)),
  free.sulfur.dioxide = log(free.sulfur.dioxide),
  total.sulfur.dioxide = log(total.sulfur.dioxide)
)

# Split the rows into training and test sets with caTools::sample.split
# (SplitRatio = 0.8); the fixed seed makes the split reproducible.
set.seed(1234)
split <- sample.split(wine_quality$quality, SplitRatio = 0.8)
training_set <- tranformed_wine_quality[split, ]
test_set <- tranformed_wine_quality[!split, ]


# Linear regression of quality on all other variables of the training set
model1 <- lm(formula = quality ~ ., data = training_set)
summary_model1 <- summary(model1)
summary_model1
## 
## Call:
## lm(formula = quality ~ ., data = training_set)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.32368 -0.36012 -0.07567  0.46699  2.11979 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           28.75778   24.16304   1.190 0.234208    
## fixed.acidity          0.05035    0.02914   1.728 0.084240 .  
## volatile.acidity      -1.10739    0.13369  -8.283 3.02e-16 ***
## citric.acid           -0.21544    0.16165  -1.333 0.182854    
## residual.sugar         0.01758    0.01673   1.051 0.293453    
## chlorides             -1.53106    0.54256  -2.822 0.004849 ** 
## free.sulfur.dioxide    0.12564    0.04419   2.843 0.004535 ** 
## total.sulfur.dioxide  -0.15365    0.04485  -3.426 0.000633 ***
## density              -27.89910   24.41961  -1.142 0.253467    
## pH                    -0.25230    0.21223  -1.189 0.234733    
## sulphates              0.89879    0.13048   6.888 8.89e-12 ***
## alcohol                6.45504    0.75699   8.527  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.6558 on 1266 degrees of freedom
## Multiple R-squared:  0.344,  Adjusted R-squared:  0.3383 
## F-statistic: 60.35 on 11 and 1266 DF,  p-value: < 2.2e-16
# Predict quality for the held-out test set with model1
y_pred <- predict(model1, newdata = test_set)

y_actual <- test_set$quality

# Training-set MSE: mean of the squared residuals of model1
train_MSE <- mean(residuals(model1)^2)
train_MSE
## [1] 0.4260819
# Test-set MSE (mean squared prediction error)
MSPE <- mean((y_actual - y_pred)^2)
MSPE
## [1] 0.3921996
df <- as.data.frame(cbind(y_actual, y_pred))

library(ggplot2)
# Box plot of the predicted values for each observed quality score
ggplot(data = df, aes(x = as.factor(y_actual), y = y_pred)) +
  geom_boxplot()

# Variable selection by backward elimination: start from the full model,
# i.e. quality regressed on every other variable of the training set

model2 <- lm(formula = quality ~ ., data = training_set)

summary_model2 <- summary(model2)
summary_model2
## 
## Call:
## lm(formula = quality ~ ., data = training_set)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.32368 -0.36012 -0.07567  0.46699  2.11979 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           28.75778   24.16304   1.190 0.234208    
## fixed.acidity          0.05035    0.02914   1.728 0.084240 .  
## volatile.acidity      -1.10739    0.13369  -8.283 3.02e-16 ***
## citric.acid           -0.21544    0.16165  -1.333 0.182854    
## residual.sugar         0.01758    0.01673   1.051 0.293453    
## chlorides             -1.53106    0.54256  -2.822 0.004849 ** 
## free.sulfur.dioxide    0.12564    0.04419   2.843 0.004535 ** 
## total.sulfur.dioxide  -0.15365    0.04485  -3.426 0.000633 ***
## density              -27.89910   24.41961  -1.142 0.253467    
## pH                    -0.25230    0.21223  -1.189 0.234733    
## sulphates              0.89879    0.13048   6.888 8.89e-12 ***
## alcohol                6.45504    0.75699   8.527  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.6558 on 1266 degrees of freedom
## Multiple R-squared:  0.344,  Adjusted R-squared:  0.3383 
## F-statistic: 60.35 on 11 and 1266 DF,  p-value: < 2.2e-16
# Backward selection starting from model2; the step-by-step trace is
# suppressed, so only the finally selected model is printed
step(model2, direction = "backward", trace = 0)
## 
## Call:
## lm(formula = quality ~ volatile.acidity + chlorides + free.sulfur.dioxide + 
##     total.sulfur.dioxide + pH + sulphates + alcohol, data = training_set)
## 
## Coefficients:
##          (Intercept)      volatile.acidity             chlorides  
##               1.6635               -1.0370               -1.7453  
##  free.sulfur.dioxide  total.sulfur.dioxide                    pH  
##               0.1382               -0.1732               -0.4685  
##            sulphates               alcohol  
##               0.8664                6.9831
#

# Forward selection: start from the intercept-only model ...
min.model <- lm(quality ~ 1, data = training_set)

# ... and allow terms up to the formula of the full model as the search scope.
# This variable shares its name with stats::formula(); calls such as
# formula(x) still resolve to the function.
formula <- formula(lm(quality ~ ., training_set))

fwd.model <- step(min.model, direction = "forward", scope = formula)
## Start:  AIC=-549.49
## quality ~ 1
## 
##                        Df Sum of Sq    RSS     AIC
## + alcohol               1   176.568 653.51 -853.14
## + volatile.acidity      1   124.551 705.53 -755.26
## + sulphates             1    61.663 768.42 -646.14
## + citric.acid           1    46.230 783.85 -620.73
## + density               1    20.577 809.50 -579.57
## + total.sulfur.dioxide  1    18.190 811.89 -575.81
## + fixed.acidity         1    14.558 815.52 -570.11
## + chlorides             1    13.155 816.93 -567.91
## + pH                    1     3.455 826.63 -552.82
## <none>                              830.08 -549.49
## + free.sulfur.dioxide   1     0.422 829.66 -548.14
## + residual.sugar        1     0.251 829.83 -547.88
## 
## Step:  AIC=-853.14
## quality ~ alcohol
## 
##                        Df Sum of Sq    RSS      AIC
## + volatile.acidity      1    75.259 578.25 -1007.50
## + sulphates             1    39.091 614.42  -929.97
## + citric.acid           1    27.306 626.21  -905.69
## + pH                    1    23.036 630.48  -897.00
## + fixed.acidity         1    20.741 632.77  -892.36
## + density               1     4.334 649.18  -859.64
## + total.sulfur.dioxide  1     1.609 651.90  -854.29
## <none>                              653.51  -853.14
## + chlorides             1     0.157 653.36  -851.45
## + free.sulfur.dioxide   1     0.064 653.45  -851.26
## + residual.sugar        1     0.014 653.50  -851.17
## 
## Step:  AIC=-1007.5
## quality ~ alcohol + volatile.acidity
## 
##                        Df Sum of Sq    RSS     AIC
## + sulphates             1   17.1337 561.12 -1043.9
## + fixed.acidity         1    5.4840 572.77 -1017.7
## + pH                    1    5.4751 572.78 -1017.7
## + density               1    1.6162 576.64 -1009.1
## + total.sulfur.dioxide  1    1.5266 576.73 -1008.9
## <none>                              578.25 -1007.5
## + citric.acid           1    0.4164 577.84 -1006.4
## + chlorides             1    0.0649 578.19 -1005.6
## + free.sulfur.dioxide   1    0.0145 578.24 -1005.5
## + residual.sugar        1    0.0000 578.25 -1005.5
## 
## Step:  AIC=-1043.94
## quality ~ alcohol + volatile.acidity + sulphates
## 
##                        Df Sum of Sq    RSS     AIC
## + chlorides             1    3.3055 557.82 -1049.5
## + pH                    1    3.2298 557.89 -1049.3
## + fixed.acidity         1    3.1696 557.95 -1049.2
## + total.sulfur.dioxide  1    2.8727 558.25 -1048.5
## <none>                              561.12 -1043.9
## + density               1    0.1386 560.98 -1042.3
## + free.sulfur.dioxide   1    0.0287 561.09 -1042.0
## + citric.acid           1    0.0129 561.11 -1042.0
## + residual.sugar        1    0.0002 561.12 -1041.9
## 
## Step:  AIC=-1049.49
## quality ~ alcohol + volatile.acidity + sulphates + chlorides
## 
##                        Df Sum of Sq    RSS     AIC
## + pH                    1    4.6363 553.18 -1058.2
## + fixed.acidity         1    3.6451 554.17 -1055.9
## + total.sulfur.dioxide  1    2.9941 554.82 -1054.4
## <none>                              557.82 -1049.5
## + density               1    0.2467 557.57 -1048.1
## + citric.acid           1    0.0784 557.74 -1047.7
## + free.sulfur.dioxide   1    0.0516 557.76 -1047.6
## + residual.sugar        1    0.0276 557.79 -1047.5
## 
## Step:  AIC=-1058.16
## quality ~ alcohol + volatile.acidity + sulphates + chlorides + 
##     pH
## 
##                        Df Sum of Sq    RSS     AIC
## + total.sulfur.dioxide  1   2.61560 550.56 -1062.2
## <none>                              553.18 -1058.2
## + citric.acid           1   0.72663 552.45 -1057.8
## + fixed.acidity         1   0.44308 552.74 -1057.2
## + free.sulfur.dioxide   1   0.00168 553.18 -1056.2
## + density               1   0.00071 553.18 -1056.2
## + residual.sugar        1   0.00044 553.18 -1056.2
## 
## Step:  AIC=-1062.22
## quality ~ alcohol + volatile.acidity + sulphates + chlorides + 
##     pH + total.sulfur.dioxide
## 
##                       Df Sum of Sq    RSS     AIC
## + free.sulfur.dioxide  1    4.3778 546.19 -1070.4
## <none>                             550.56 -1062.2
## + citric.acid          1    0.5565 550.01 -1061.5
## + fixed.acidity        1    0.1770 550.39 -1060.6
## + residual.sugar       1    0.0738 550.49 -1060.4
## + density              1    0.0028 550.56 -1060.2
## 
## Step:  AIC=-1070.42
## quality ~ alcohol + volatile.acidity + sulphates + chlorides + 
##     pH + total.sulfur.dioxide + free.sulfur.dioxide
## 
##                  Df Sum of Sq    RSS     AIC
## <none>                        546.19 -1070.4
## + fixed.acidity   1  0.289937 545.90 -1069.1
## + citric.acid     1  0.175559 546.01 -1068.8
## + residual.sugar  1  0.088065 546.10 -1068.6
## + density         1  0.025579 546.16 -1068.5
# Results
# quality ~ alcohol + volatile.acidity + sulphates + total.sulfur.dioxide + chlorides + pH + free.sulfur.dioxide
#
#################

# Refit the linear model with the seven selected predictors only
model3 <- lm(
  formula = quality ~ volatile.acidity + chlorides + free.sulfur.dioxide +
    total.sulfur.dioxide + pH + sulphates + alcohol,
  data = training_set
)

summary_model3 <- summary(model3)
summary_model3
## 
## Call:
## lm(formula = quality ~ volatile.acidity + chlorides + free.sulfur.dioxide + 
##     total.sulfur.dioxide + pH + sulphates + alcohol, data = training_set)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.24794 -0.37109 -0.06992  0.47185  2.12950 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           1.66354    0.56919   2.923 0.003532 ** 
## volatile.acidity     -1.03698    0.11354  -9.133  < 2e-16 ***
## chlorides            -1.74533    0.52507  -3.324 0.000913 ***
## free.sulfur.dioxide   0.13818    0.04331   3.190 0.001455 ** 
## total.sulfur.dioxide -0.17320    0.04296  -4.032 5.86e-05 ***
## pH                   -0.46853    0.13231  -3.541 0.000413 ***
## sulphates             0.86638    0.12565   6.895 8.45e-12 ***
## alcohol               6.98311    0.49435  14.126  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.6558 on 1270 degrees of freedom
## Multiple R-squared:  0.342,  Adjusted R-squared:  0.3384 
## F-statistic:  94.3 on 7 and 1270 DF,  p-value: < 2.2e-16
vif(model3)
##     volatile.acidity            chlorides  free.sulfur.dioxide 
##             1.232230             1.251374             2.638836 
## total.sulfur.dioxide                   pH            sulphates 
##             2.752512             1.232092             1.262575 
##              alcohol 
##             1.287294
# VIF values for all the variables are below 5 (the largest, for free and total sulfur dioxide, are about 2.6-2.8), so multicollinearity does not seem to be affecting our model

# Test-set predictions from the reduced model (model3)
y_pred3 <- predict(model3, newdata = test_set)

y_actual3 <- test_set$quality

# MSE on the training set and mean squared prediction error on the test set
train_MSE3 <- mean(residuals(model3)^2)
MSPE3 <- mean((y_actual3 - y_pred3)^2)
# This is the 2nd set of models, where the dependent variable is treated as a
# categorical variable, so the linear regression approach no longer applies.

levels(as.factor(wine_quality$quality))
## [1] "3" "4" "5" "6" "7" "8"
# As shown above, the categorical variable quality has 6 different levels (3-8).
# Binary logistic regression cannot classify all 6 classes at the same time,
# so a multinomial logistic regression model is used here.

training_set$quality <- as.factor(training_set$quality)
test_set$quality <- as.factor(test_set$quality)

# Standardise the predictors (every column except the response, selected by
# name rather than by position). The test set is scaled with the TRAINING-set
# means and standard deviations: scaling it with its own statistics would put
# the two sets on different scales and let test-set information shape the
# preprocessing.
predictor_cols <- setdiff(names(training_set), "quality")
scaled_train <- scale(training_set[predictor_cols])
test_set[predictor_cols] <- scale(
  test_set[predictor_cols],
  center = attr(scaled_train, "scaled:center"),
  scale = attr(scaled_train, "scaled:scale")
)
training_set[predictor_cols] <- scaled_train
# NOTE: the test-set results recorded further down were produced with the
# earlier per-set scaling; re-knit the document to refresh them.

# Multinomial logistic regression of quality on all standardised predictors
glm.fit <- multinom(quality ~ ., data = training_set)
## # weights:  78 (60 variable)
## initial  value 2289.868602 
## iter  10 value 1627.470969
## iter  20 value 1265.644460
## iter  30 value 1200.448243
## iter  40 value 1173.063818
## iter  50 value 1165.296391
## iter  60 value 1164.000887
## iter  70 value 1163.949313
## final  value 1163.948594 
## converged
summary(glm.fit)
## Call:
## multinom(formula = quality ~ ., data = training_set)
## 
## Coefficients:
##   (Intercept) fixed.acidity volatile.acidity citric.acid residual.sugar
## 4    6.409809     2.7804077        -1.962018  -2.0060549     0.52195799
## 5    9.640708     1.1679273        -2.358993  -1.8293426    -0.16062131
## 6    9.784926     1.7279007        -2.794595  -2.1042224    -0.04392396
## 7    7.556765     2.1228349        -3.216412  -2.1038674     0.28582991
## 8    3.876369     0.5254109        -2.381461  -0.8180732    -0.52986416
##    chlorides free.sulfur.dioxide total.sulfur.dioxide   density         pH
## 4  0.2315770          -0.8875621             2.022564 -3.450494  0.1357967
## 5  0.2229732          -0.4098571             2.362025 -2.212443 -1.1050954
## 6  0.1688509          -0.1782503             1.925406 -2.525190 -0.9280915
## 7 -0.2995520           0.2370099             1.280785 -2.664251 -0.7888335
## 8 -1.1164822          -0.4406022             1.483864 -2.291996 -1.9452987
##    sulphates  alcohol
## 4 -0.1287139 1.176349
## 5  0.2459895 1.506625
## 6  0.6102799 2.171551
## 7  1.0508378 2.844505
## 8  1.1513499 3.516117
## 
## Std. Errors:
##   (Intercept) fixed.acidity volatile.acidity citric.acid residual.sugar
## 4    2.040605      1.592743        0.6835868   0.9033522      0.4649719
## 5    2.033815      1.545408        0.6796741   0.8656888      0.4429749
## 6    2.033773      1.548275        0.6833376   0.8690179      0.4455230
## 7    2.040150      1.562479        0.7000762   0.8845010      0.4556881
## 8    2.183888      1.745202        0.8051351   1.0136004      0.6728811
##   chlorides free.sulfur.dioxide total.sulfur.dioxide  density        pH
## 4 0.4938235           0.9811067             1.032053 1.379101 0.9984517
## 5 0.4488041           0.9635862             1.010962 1.341338 0.9801278
## 6 0.4522850           0.9656934             1.013241 1.344430 0.9820645
## 7 0.4944618           0.9797889             1.029682 1.360104 0.9935244
## 8 0.7970266           1.1121433             1.163382 1.512250 1.1187153
##   sulphates   alcohol
## 4 0.9041114 0.9645808
## 5 0.8617125 0.9312921
## 6 0.8630185 0.9344517
## 7 0.8679310 0.9467140
## 8 0.8940197 1.0827069
## 
## Residual Deviance: 2327.897 
## AIC: 2447.897
# Prediction: class-membership probabilities for every test-set row
y_pred_multinom <- predict(glm.fit, test_set, type = "probs")

# check.names = FALSE keeps the probability matrix's own column names (the
# class labels) instead of letting data.frame() prefix them with "X" and then
# re-typing the labels by hand.
a1 <- data.frame(y_pred_multinom, check.names = FALSE)
b1 <- fitted(glm.fit)

# Predicted class = label of the column holding the largest probability in
# each row. ties.method = "first" matches which.max(), which also returns the
# first maximum. Done in one vectorised call instead of a row-by-row loop.
c1 <- data.frame(
  V1 = colnames(a1)[max.col(y_pred_multinom, ties.method = "first")],
  stringsAsFactors = FALSE
)
table(c1)
## c1
##   3   4   5   6   7 
##   1   1 154 148  17
# Confusion matrix: observed quality (rows) vs. predicted class (columns)
cm1 <- table(test_set$quality, c1[, 1])
cm1
##    
##       3   4   5   6   7
##   3   0   0   1   1   0
##   4   0   0   8   3   0
##   5   1   1 104  29   1
##   6   0   0  38  86   4
##   7   0   0   3  27  10
##   8   0   0   0   2   2
# Accuracy = share of test rows whose predicted class equals the observed one.
# Comparing the labels directly avoids adding up hand-picked cells of cm1,
# which only line up while the predicted classes happen to start at "3"; the
# table is not square because class 8 is never predicted.
accuracy1 <- mean(as.character(test_set$quality) == as.character(c1[, 1]))
accuracy1
## [1] 0.623053
# Bind the observed and predicted classes side by side and inspect the result.
# NOTE(review): cbind() on the factor test_set$quality appears to keep its
# integer codes rather than the labels (the str() output lists levels "1",
# "2", ... for V1) -- confirm that this is intended.
t1111<-as.data.frame(cbind(test_set$quality,c1[,1]))
str(t1111)
## 'data.frame':    321 obs. of  2 variables:
##  $ V1: Factor w/ 6 levels "1","2","3","4",..: 3 3 3 4 4 3 2 4 3 3 ...
##  $ V2: Factor w/ 5 levels "3","4","5","6",..: 3 3 3 3 3 3 3 4 3 3 ...
library(caret)
## Loading required package: lattice
# Vector of six zeros, appended to cm1 as an extra column named "x1" below.
# NOTE(review): presumably a stand-in for the missing "8" prediction column -- confirm.
x1<-rep(0,6)


# Print the confusion matrix again
cm1
##    
##       3   4   5   6   7
##   3   0   0   1   1   0
##   4   0   0   8   3   0
##   5   1   1 104  29   1
##   6   0   0  38  86   4
##   7   0   0   3  27  10
##   8   0   0   0   2   2
# The first assignment is immediately replaced by the second one, which wraps
# the same cbind() result in a data frame.
t1111<-cbind(cm1, x1)
t1111<-data.frame(cbind(cm1, x1))

# Earlier plotting attempts, left disabled:
#ggplot(data = t1111, aes(x=V1, y=V2, fill=value)) + geom_tile()
#plot(confusionMatrix$t1111)

# Jittered scatter of the predicted class (c1[,1]) against the observed quality
# of the test set, coloured by observed quality.
# NOTE(review): qplot() is deprecated in recent ggplot2 releases -- consider
# switching to ggplot() when this plot is next touched.
qplot(quality, c1[,1], data=test_set,  colour= quality, geom = c( "jitter", "abline"), main = "predicted vs. observed in validation data", xlab = "Observed Classe", ylab = "Predicted Classe")

#here we achieve an accuracy of 62.31 % (accuracy1 = 0.623053) on the classification model with 6 categories of the dependent variable
#this classification accuracy is much better than randomly classifying the data into 6 classes, which would have an accuracy
#of 100%/6 = 16.67% (approximately)
# 2nd multinomial logit model with selected variables

wine_quality <- read.csv("winequality-red.csv", header = TRUE, sep = ",")

training_set$quality <- as.factor(training_set$quality)
test_set$quality <- as.factor(test_set$quality)
# Feature scaling
# The test set is standardised with the TRAINING-set means and standard
# deviations rather than with its own, so both sets share one scale. On data
# that are already standardised this step leaves the values unchanged (up to
# floating-point rounding), so it is safe to run again.
predictor_cols <- setdiff(names(training_set), "quality")
scaled_train <- scale(training_set[predictor_cols])
test_set[predictor_cols] <- scale(
  test_set[predictor_cols],
  center = attr(scaled_train, "scaled:center"),
  scale = attr(scaled_train, "scaled:scale")
)
training_set[predictor_cols] <- scaled_train

# Multinomial logistic regression on the selected subset of predictors
multinom_model2 <- multinom(
  quality ~ fixed.acidity + volatile.acidity + residual.sugar +
    free.sulfur.dioxide + density + pH + sulphates + alcohol,
  data = training_set
)
## # weights:  60 (45 variable)
## initial  value 2289.868602 
## iter  10 value 1495.391682
## iter  20 value 1262.920575
## iter  30 value 1215.052357
## iter  40 value 1197.663219
## iter  50 value 1196.101548
## iter  60 value 1196.039980
## final  value 1196.039912 
## converged
#summary(multinom_model2)

# Prediction: class-membership probabilities for every test-set row
multinom_prob2 <- predict(multinom_model2, test_set, type = "probs")

# check.names = FALSE keeps the probability matrix's own column names (the
# class labels) instead of letting data.frame() prefix them with "X" and then
# re-typing the labels by hand.
df_multinom_prob2 <- data.frame(multinom_prob2, check.names = FALSE)

# Predicted class = label of the column holding the largest probability in
# each row. ties.method = "first" matches which.max(), which also returns the
# first maximum. Done in one vectorised call instead of a row-by-row loop.
df_class_2 <- data.frame(
  V1 = colnames(df_multinom_prob2)[
    max.col(multinom_prob2, ties.method = "first")
  ],
  stringsAsFactors = FALSE
)
table(df_class_2)
## df_class_2
##   4   5   6   7 
##   1 164 141  15
# Confusion matrix: observed quality (rows) vs. predicted class (columns)
cm2 <- table(test_set$quality, df_class_2[, 1])
cm2
##    
##       4   5   6   7
##   3   0   1   1   0
##   4   0   8   3   0
##   5   1 106  29   0
##   6   0  47  78   3
##   7   0   2  28  10
##   8   0   0   2   2
# Accuracy = share of test rows whose predicted class equals the observed one.
# Comparing the labels directly avoids hand-picked cell positions: here classes
# 3 and 8 are never predicted, so the table has only four columns and its
# "diagonal" is shifted.
accuracy2 <- mean(
  as.character(test_set$quality) == as.character(df_class_2[, 1])
)

accuracy2
## [1] 0.6043614
# Here the accuracy is 60.44%.
# Using only the selected variables does not improve classification accuracy;
# it is slightly lower than for the model with all variables.
# Plots for visualization
# Density-scaled histograms of all 12 columns on a 4 x 3 grid
par(mfrow = c(4, 3))

for (i in seq_len(12)) {
  hist(
    wine_quality[, i],
    main = colnames(wine_quality)[i],
    probability = TRUE,
    col = "orange",
    border = "white"
  )
}

# Back to one plot per page: the first two columns on their own
par(mfrow = c(1, 1))
for (i in seq_len(2)) {
  hist(
    wine_quality[, i],
    main = colnames(wine_quality)[i],
    probability = TRUE,
    col = "orange",
    border = "white"
  )
}

# Column 11 is alcohol; plot its histogram separately
colnames(wine_quality)[11]
## [1] "alcohol"
hist(
  wine_quality[, 11],
  main = colnames(wine_quality)[11],
  probability = TRUE,
  col = "orange",
  border = "white"
)

# Predicted class against observed quality for the test set, with larger points
qplot(
  quality, c1[,1],
  data = test_set,
  colour = quality,
  geom = c("jitter", "abline"),
  size = 4,
  main = "predicted vs. observed in validation data",
  xlab = "Observed Classe",
  ylab = "Predicted Classe"
)

# Hypothesis test
#
# Null hypothesis: the alcohol content does not make any difference to the
#   quality of wine
# Alternative hypothesis: the quality of wine is different for different
#   alcohol levels
# The alcohol values are split into wines rated <= 5 (x1) and wines rated > 5
# (x2), and the two group means are compared with a two-sample t-test.
x1 <- with(wine_quality, alcohol[quality <= 5])
x2 <- with(wine_quality, alcohol[quality > 5])
t.test(x1, x2)
## 
##  Welch Two Sample t-test
## 
## data:  x1 and x2
## t = -19.782, df = 1516.8, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -1.020622 -0.836479
## sample estimates:
## mean of x mean of y 
##  9.926478 10.855029
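#test result
#The p-value is less than the significance threshold, which implies that we can reject the null hypothesis. Mean alcohol content differs significantly between wines rated <= 5 and wines rated > 5, i.e. alcohol content is associated with the quality of wine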

Including Plots

You can also embed plots, for example: