Data Preparation
#glm(y ~ offset(log(exposure)) + x, family=poisson(link=log) )
#glm.nb
#zeroinflated #zeroinfl()
#Conwell-Maxwell-Poisson
library(MASS)
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
library(corrplot)
library(pscl)
## Classes and Methods for R developed in the
## Political Science Computational Laboratory
## Department of Political Science
## Stanford University
## Simon Jackman
## hurdle and zeroinfl functions by Achim Zeileis
library(cramer)
## Loading required package: boot
##
## Attaching package: 'boot'
## The following object is masked from 'package:lattice':
##
## melanoma
exp<-read.csv("https://raw.githubusercontent.com/scottogden10/621_hw/master/wine-evaluation-data.csv")
train<-read.csv("https://raw.githubusercontent.com/scottogden10/621_hw/master/wine-training-data.csv")
hist(train$TARGET,col="violet",xlab="",main = "Histogram of Wine Counts")

mean(train$TARGET)
## [1] 3.029074
sd(train$TARGET)
## [1] 1.926368
##Not obnoxiously underdispursed.
##Zero Inflated
corrplot(as.matrix(cor(train[-1],use = "pairwise.complete")),method = "shade")

##Run a KS test on distribitions of TARGET based on nulls for certain variables, to determine if we should use them as binary predictors
##Do x and y come from the same distribution? Lower when different, Higher when same
pvals<-c(1-ks.test(train[is.na(train$ResidualSugar),2],train[!is.na(train$ResidualSugar),2])$p.value,
1-ks.test(train[is.na(train$Chlorides),2],train[!is.na(train$Chlorides),2])$p.value,
1-ks.test(train[is.na(train$FreeSulfurDioxide),2],train[!is.na(train$FreeSulfurDioxide),2])$p.value,
1-ks.test(train[is.na(train$TotalSulfurDioxide),2],train[!is.na(train$TotalSulfurDioxide),2])$p.value,
1-ks.test(train[is.na(train$pH),2],train[!is.na(train$pH),2])$p.value,
1-ks.test(train[is.na(train$Sulphates),2],train[!is.na(train$Sulphates),2])$p.value,
1-ks.test(train[is.na(train$Alcohol),2],train[!is.na(train$Alcohol),2])$p.value,
1-ks.test(train[is.na(train$STARS),2],train[!is.na(train$STARS),2])$p.value)
## Warning in ks.test(train[is.na(train$ResidualSugar), 2], train[!is.na(train
## $ResidualSugar), : p-value will be approximate in the presence of ties
## Warning in ks.test(train[is.na(train$Chlorides), 2], train[!is.na(train
## $Chlorides), : p-value will be approximate in the presence of ties
## Warning in ks.test(train[is.na(train$FreeSulfurDioxide), 2], train[!
## is.na(train$FreeSulfurDioxide), : p-value will be approximate in the
## presence of ties
## Warning in ks.test(train[is.na(train$TotalSulfurDioxide), 2], train[!
## is.na(train$TotalSulfurDioxide), : p-value will be approximate in the
## presence of ties
## Warning in ks.test(train[is.na(train$pH), 2], train[!is.na(train$pH), 2]):
## p-value will be approximate in the presence of ties
## Warning in ks.test(train[is.na(train$Sulphates), 2], train[!is.na(train
## $Sulphates), : p-value will be approximate in the presence of ties
## Warning in ks.test(train[is.na(train$Alcohol), 2], train[!is.na(train
## $Alcohol), : p-value will be approximate in the presence of ties
## Warning in ks.test(train[is.na(train$STARS), 2], train[!is.na(train
## $STARS), : p-value will be approximate in the presence of ties
names<-c("ResidualSugar","Chlorides","FreeSulfur","Total Sulfur","pH","Sulphates","Alcohol","STARS")
data.frame(names,pvals)
## names pvals
## 1 ResidualSugar 0.4000466013
## 2 Chlorides 0.3753065208
## 3 FreeSulfur 0.0142159411
## 4 Total Sulfur 0.1437760700
## 5 pH 0.5677559630
## 6 Sulphates 0.1602013270
## 7 Alcohol 0.0006181788
## 8 STARS 1.0000000000
traini<-train
traini$FreeSulfurDioxideBIN<-as.numeric(is.na(traini$FreeSulfurDioxide))
traini$AlcoholBIN<-as.numeric(is.na(traini$Alcohol))
traini$starsna<-as.numeric(is.na(train$STARS))
traini$sugarna<-as.numeric(is.na(train$ResidualSugar))
traini$chloridesna<-as.numeric(is.na(train$Chlorides))
traini$freesulfurna<-as.numeric(is.na(train$FreeSulfurDioxide))
traini$totalsulfurna<-as.numeric(is.na(train$TotalSulfurDioxide))
traini$pHna<-as.numeric(is.na(train$pH))
traini$sulphatesna<-as.numeric(is.na(train$Sulphates))
Combining and Imputing
#imput median
for(i in 3:ncol(traini)){
traini[is.na(traini[,i]), i] <- median(as.numeric(traini[,i]), na.rm = TRUE)
}
cormat<-as.matrix(cor(traini[-c(1,2,17,18)],use = "pairwise.complete"))
NFeigen<-eigen(cormat)
Nfact<-sum(as.numeric(NFeigen$values >=1))
Nfact
## [1] 10
library(nFactors)
## Loading required package: psych
##
## Attaching package: 'psych'
## The following object is masked from 'package:boot':
##
## logit
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
##
## Attaching package: 'nFactors'
## The following object is masked from 'package:lattice':
##
## parallel
ev <- NFeigen # get eigenvalues
ap <- parallel(subject=nrow(traini[-c(1,2,17,18)]),var=ncol(traini[-c(1,2,17,18)]),
rep=100,cent=.05)
nS <- nScree(x=ev$values, aparallel=ap$eigen$qevpea)
plotnScree(nS)

componentAxis(cormat, nFactors=6)
## $values
## [1] 1.400333 1.241261 1.067708 1.066332 1.051909 1.038167
##
## $varExplained
## [1] 6.67 5.91 5.08 5.08 5.01 4.94
##
## $cumVarExplained
## [1] 6.67 12.58 17.66 22.74 27.75 32.69
##
## $loadings
## [,1] [,2] [,3] [,4] [,5]
## [1,] -0.34746071 0.428203145 -0.02467452 -0.16837412 -0.23650564
## [2,] -0.22007217 0.046163111 0.10539815 0.39599637 0.20885358
## [3,] -0.05090808 0.168652740 -0.24017424 -0.42894452 -0.03939924
## [4,] 0.07375891 -0.064506720 -0.45369088 0.22140786 -0.19336970
## [5,] -0.09089572 0.108819357 -0.07635596 0.25302133 0.20102389
## [6,] 0.09852688 -0.109512176 -0.12239003 -0.04491842 -0.52467487
## [7,] 0.13864798 -0.173606259 -0.15645510 -0.03407998 -0.36977368
## [8,] -0.12739178 0.035720573 0.03416182 0.32671312 -0.20442978
## [9,] 0.06702037 -0.133083914 0.11773136 0.40972820 -0.12990528
## [10,] -0.11681342 0.095001103 0.27543064 -0.07369860 -0.28131563
## [11,] 0.15963922 0.046527455 0.30537939 -0.24865672 0.38100275
## [12,] 0.50610610 0.607385739 -0.04517861 0.13335924 -0.05461606
## [13,] -0.55069022 0.506021811 -0.11228994 -0.08244424 -0.07365447
## [14,] 0.54733816 0.533262711 0.05927593 0.16057715 0.01065830
## [15,] -0.51462651 0.167526461 0.08715146 0.24137994 0.04327930
## [16,] 0.01429524 0.001130028 0.28986068 0.17046689 -0.21866805
## [17,] 0.02475196 -0.010165449 0.51092204 -0.13581881 -0.15890549
## [18,] -0.02157753 0.027880919 0.21372714 0.02542797 -0.10022560
## [19,] 0.01450723 0.003115052 0.22437129 0.03131073 -0.25542342
## [20,] 0.01405612 0.025250520 -0.25755219 -0.07111424 0.07987233
## [21,] -0.02416406 -0.008803681 -0.17280955 0.27614132 0.11352313
## [,6]
## [1,] 0.143378781
## [2,] 0.307455477
## [3,] 0.128727808
## [4,] 0.041257341
## [5,] -0.508737165
## [6,] 0.241960741
## [7,] -0.245429980
## [8,] -0.378662593
## [9,] 0.385105746
## [10,] 0.200980426
## [11,] 0.156415643
## [12,] 0.006365719
## [13,] -0.015654669
## [14,] -0.004185374
## [15,] -0.013391024
## [16,] 0.085446252
## [17,] -0.145412498
## [18,] -0.214452189
## [19,] -0.150291516
## [20,] -0.048953071
## [21,] 0.266008832
##
## $communalities
## [1] 0.40953795 0.35663338 0.29083559 0.30355236 0.38917782 0.37252617
## [7] 0.27197049 0.31059001 0.36912274 0.22349572 0.35236517 0.64791005
## [13] 0.58439393 0.61337795 0.36081764 0.16840058 0.32659983 0.10360372
## [19] 0.13937166 0.08100152 0.19042678
f<-factanal(traini[-c(1,2,17,18)]
, factors = 6,
scores = "Bartlett",rotation = "varimax")
#write.csv(f$correlation,"xzxyt.csv")
traini$STARLABEL<-traini$LabelAppeal+traini$STARS+2
##
Model Building
library(caret)
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
library(car)
##
## Attaching package: 'car'
## The following object is masked from 'package:psych':
##
## logit
## The following object is masked from 'package:boot':
##
## logit
traini$lin2fit<-traini$ï..INDEX
traini$TARGETBIN<-traini$ï..INDEX
lin1<-lm(data = traini,TARGET~.-ï..INDEX-STARS-LabelAppeal-lin2fit-sulphatesna-Sulphates-totalsulfurna-FixedAcidity-pHna-CitricAcid-ResidualSugar-FreeSulfurDioxideBIN-freesulfurna-AlcoholBIN-chloridesna-FreeSulfurDioxide-TARGETBIN-pH-Density-sugarna)
wts1 <- 1/fitted(lm(abs(residuals(lin1)) ~ fitted(lin1)))^2
lin1w<-lm(data = traini,TARGET~.-ï..INDEX-STARS-LabelAppeal-lin2fit-sulphatesna-Sulphates-totalsulfurna-FixedAcidity-pHna-CitricAcid-ResidualSugar-FreeSulfurDioxideBIN-freesulfurna-AlcoholBIN-chloridesna-FreeSulfurDioxide-TARGETBIN-pH-Density-sugarna,weights=wts1)
summary(lin1w)
##
## Call:
## lm(formula = TARGET ~ . - ï..INDEX - STARS - LabelAppeal - lin2fit -
## sulphatesna - Sulphates - totalsulfurna - FixedAcidity -
## pHna - CitricAcid - ResidualSugar - FreeSulfurDioxideBIN -
## freesulfurna - AlcoholBIN - chloridesna - FreeSulfurDioxide -
## TARGETBIN - pH - Density - sugarna, data = traini, weights = wts1)
##
## Weighted Residuals:
## Min 1Q Median 3Q Max
## -7.3788 -0.7600 0.0452 0.8674 5.0983
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.218e+00 8.617e-02 25.736 < 2e-16 ***
## VolatileAcidity -8.475e-02 1.368e-02 -6.197 5.94e-10 ***
## Chlorides -9.538e-02 3.458e-02 -2.758 0.00582 **
## TotalSulfurDioxide 1.448e-04 4.783e-05 3.027 0.00248 **
## Alcohol 1.779e-02 2.960e-03 6.012 1.88e-09 ***
## AcidIndex -1.753e-01 9.206e-03 -19.044 < 2e-16 ***
## starsna -2.315e+00 3.188e-02 -72.618 < 2e-16 ***
## STARLABEL 6.479e-01 7.343e-03 88.237 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.288 on 12787 degrees of freedom
## Multiple R-squared: 0.5707, Adjusted R-squared: 0.5705
## F-statistic: 2429 on 7 and 12787 DF, p-value: < 2.2e-16
vif(lin1w)
## VolatileAcidity Chlorides TotalSulfurDioxide
## 1.005877 1.001563 1.001915
## Alcohol AcidIndex starsna
## 1.007459 1.029928 1.048985
## STARLABEL
## 1.030972
##Performance
lin1fit<-lin1w$fitted.values
lin1fit[lin1fit<0]<-0
lin1fit<-round(lin1fit)
hist(lin1fit)

#write.csv(data.frame(lin1fit,traini$TARGET),"mod1.csv")
##Assign any negative values zero, also round
Linear model 2, 2 stage
##Stage 1 zeros
traini$TARGETBIN<-as.numeric(traini$TARGET==0)
zero<-glm(data=traini,TARGETBIN~.-ï..INDEX-TARGET-TARGETBIN-lin2fit-traini$lin2fit-STARS-traini$LabelAppeal-freesulfurna-FixedAcidity-sugarna-Density-chloridesna-FreeSulfurDioxideBIN-sulphatesna-AlcoholBIN-totalsulfurna-CitricAcid-ResidualSugar-Chlorides)
summary(zero)
##
## Call:
## glm(formula = TARGETBIN ~ . - ï..INDEX - TARGET - TARGETBIN -
## lin2fit - traini$lin2fit - STARS - traini$LabelAppeal - freesulfurna -
## FixedAcidity - sugarna - Density - chloridesna - FreeSulfurDioxideBIN -
## sulphatesna - AlcoholBIN - totalsulfurna - CitricAcid - ResidualSugar -
## Chlorides, data = traini)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.94395 -0.14341 -0.04317 0.09580 1.03118
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -6.434e-03 2.916e-02 -0.221 0.82533
## VolatileAcidity 1.981e-02 3.604e-03 5.495 3.98e-08 ***
## FreeSulfurDioxide -6.406e-05 1.947e-05 -3.290 0.00100 **
## TotalSulfurDioxide -9.072e-05 1.251e-05 -7.253 4.31e-13 ***
## pH 1.860e-02 4.219e-03 4.408 1.05e-05 ***
## Sulphates 1.039e-02 3.179e-03 3.268 0.00108 **
## Alcohol 2.478e-03 7.780e-04 3.185 0.00145 **
## LabelAppeal 1.442e-01 5.737e-03 25.136 < 2e-16 ***
## AcidIndex 4.861e-02 2.179e-03 22.307 < 2e-16 ***
## starsna 5.108e-01 6.554e-03 77.936 < 2e-16 ***
## pHna 3.447e-02 1.629e-02 2.116 0.03436 *
## STARLABEL -9.352e-02 3.812e-03 -24.531 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 0.1015266)
##
## Null deviance: 2149.8 on 12794 degrees of freedom
## Residual deviance: 1297.8 on 12783 degrees of freedom
## AIC: 7056.9
##
## Number of Fisher Scoring iterations: 2
confusionMatrix(traini$TARGETBIN,round(zero$fitted.values))
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 8986 1075
## 1 842 1892
##
## Accuracy : 0.8502
## 95% CI : (0.8439, 0.8563)
## No Information Rate : 0.7681
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.5676
## Mcnemar's Test P-Value : 1.166e-07
##
## Sensitivity : 0.9143
## Specificity : 0.6377
## Pos Pred Value : 0.8932
## Neg Pred Value : 0.6920
## Prevalence : 0.7681
## Detection Rate : 0.7023
## Detection Prevalence : 0.7863
## Balanced Accuracy : 0.7760
##
## 'Positive' Class : 0
##
roc(traini$TARGETBIN,round(zero$fitted.values),plot = TRUE)

##
## Call:
## roc.default(response = traini$TARGETBIN, predictor = round(zero$fitted.values), plot = TRUE)
##
## Data: round(zero$fitted.values) in 10061 controls (traini$TARGETBIN 0) < 2734 cases (traini$TARGETBIN 1).
## Area under the curve: 0.7926
traini$lin2fit<-abs(round(zero$fitted.values)-1)
###stage 2 non zero linear
traini2non<-subset(traini,traini$lin2fit==1)
traini2zero<-subset(traini,traini$lin2fit==0)
lin2<-lm(data=traini2non,TARGET~.-ï..INDEX-TARGETBIN-lin2fit-STARS-LabelAppeal-freesulfurna-totalsulfurna-chloridesna-pHna-pH-ResidualSugar-sulphatesna-FixedAcidity-CitricAcid-TotalSulfurDioxide-sugarna-Sulphates-AlcoholBIN-Density-FreeSulfurDioxideBIN)
wts2 <- 1/fitted(lm(abs(residuals(lin2)) ~ fitted(lin2)))^2
lin2w<-lm(data=traini2non, TARGET~.-ï..INDEX-TARGETBIN-lin2fit-STARS-LabelAppeal-freesulfurna-totalsulfurna-chloridesna-pHna-pH-ResidualSugar-sulphatesna-FixedAcidity-CitricAcid-TotalSulfurDioxide-sugarna-Sulphates-AlcoholBIN-Density-FreeSulfurDioxideBIN,weights=wts2)
summary(lin2w)
##
## Call:
## lm(formula = TARGET ~ . - ï..INDEX - TARGETBIN - lin2fit - STARS -
## LabelAppeal - freesulfurna - totalsulfurna - chloridesna -
## pHna - pH - ResidualSugar - sulphatesna - FixedAcidity -
## CitricAcid - TotalSulfurDioxide - sugarna - Sulphates - AlcoholBIN -
## Density - FreeSulfurDioxideBIN, data = traini2non, weights = wts2)
##
## Weighted Residuals:
## Min 1Q Median 3Q Max
## -7.4850 -0.6208 0.1571 0.8562 3.9230
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.741e+00 9.428e-02 18.470 < 2e-16 ***
## VolatileAcidity -7.806e-02 1.442e-02 -5.414 6.30e-08 ***
## Chlorides -7.774e-02 3.648e-02 -2.131 0.0331 *
## FreeSulfurDioxide 1.964e-04 7.729e-05 2.541 0.0111 *
## Alcohol 1.958e-02 3.101e-03 6.313 2.86e-10 ***
## AcidIndex -1.373e-01 1.028e-02 -13.361 < 2e-16 ***
## starsna -1.596e+00 7.448e-02 -21.429 < 2e-16 ***
## STARLABEL 6.875e-01 7.545e-03 91.127 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.346 on 9820 degrees of freedom
## Multiple R-squared: 0.5068, Adjusted R-squared: 0.5064
## F-statistic: 1441 on 7 and 9820 DF, p-value: < 2.2e-16
vif(lin2w)
## VolatileAcidity Chlorides FreeSulfurDioxide Alcohol
## 1.004213 1.001164 1.001893 1.008642
## AcidIndex starsna STARLABEL
## 1.022330 1.046381 1.032497
traini2non$lin2fit<-round(lin2w$fitted.values)
traini2<-rbind(traini2non,traini2zero)
#write.csv(data.frame(traini2$TARGET,traini2$lin2fit),"mod2s.csv")
hist(traini2$lin2fit)

Poisson regular
pois1<-glm(data=traini,TARGET~.-ï..INDEX-TARGETBIN-lin2fit-STARS-LabelAppeal-freesulfurna-FixedAcidity-chloridesna-ResidualSugar-sulphatesna-totalsulfurna-AlcoholBIN-FreeSulfurDioxideBIN-CitricAcid-sugarna-pHna-Density-pH,family=poisson(link=log))
summary(pois1)
##
## Call:
## glm(formula = TARGET ~ . - ï..INDEX - TARGETBIN - lin2fit - STARS -
## LabelAppeal - freesulfurna - FixedAcidity - chloridesna -
## ResidualSugar - sulphatesna - totalsulfurna - AlcoholBIN -
## FreeSulfurDioxideBIN - CitricAcid - sugarna - pHna - Density -
## pH, family = poisson(link = log), data = traini)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.2410 -0.6298 0.0106 0.4475 3.7279
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.142e+00 4.155e-02 27.489 < 2e-16 ***
## VolatileAcidity -3.152e-02 6.517e-03 -4.837 1.32e-06 ***
## Chlorides -3.737e-02 1.647e-02 -2.270 0.023237 *
## FreeSulfurDioxide 9.616e-05 3.505e-05 2.743 0.006086 **
## TotalSulfurDioxide 8.086e-05 2.273e-05 3.557 0.000375 ***
## Sulphates -1.196e-02 5.752e-03 -2.079 0.037645 *
## Alcohol 3.695e-03 1.405e-03 2.629 0.008564 **
## AcidIndex -8.129e-02 4.486e-03 -18.123 < 2e-16 ***
## starsna -1.023e+00 1.697e-02 -60.302 < 2e-16 ***
## STARLABEL 1.735e-01 3.542e-03 48.979 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: 22861 on 12794 degrees of freedom
## Residual deviance: 13781 on 12785 degrees of freedom
## AIC: 45743
##
## Number of Fisher Scoring iterations: 6
summary.glm(pois1)$deviance/summary.glm(pois1)$df[2]
## [1] 1.077923
#Goodness of fit
1-pchisq(summary(pois1)$deviance,summary(pois1)$df.residual)
## [1] 6.027558e-10
##Poison Model Does not fit the data
## Now compare with TARGET
mod3pred<-round(exp(pois1$linear.predictors))
hist(mod3pred)

min(mod3pred)
## [1] 1
#write.csv(data.frame(traini$TARGET,mod3pred),"mod3x.csv")
Poisson Overcount Zero
#Double backwards selection ##Binomial on zero, poisson on rest
pois2<-zeroinfl(data=traini,formula = TARGET~.-ï..INDEX-TARGETBIN-lin2fit-STARS-LabelAppeal-freesulfurna-AlcoholBIN-ResidualSugar-Sulphates-FixedAcidity-pHna-chloridesna-totalsulfurna-CitricAcid-sulphatesna-sugarna-FreeSulfurDioxideBIN-TotalSulfurDioxide-pH-Chlorides-FreeSulfurDioxide-Density |
STARLABEL+AcidIndex+FreeSulfurDioxide+TotalSulfurDioxide+Sulphates+VolatileAcidity+pH+starsna+Alcohol
,dist="poisson")
summary(pois2)
##
## Call:
## zeroinfl(formula = TARGET ~ . - ï..INDEX - TARGETBIN - lin2fit -
## STARS - LabelAppeal - freesulfurna - AlcoholBIN - ResidualSugar -
## Sulphates - FixedAcidity - pHna - chloridesna - totalsulfurna -
## CitricAcid - sulphatesna - sugarna - FreeSulfurDioxideBIN -
## TotalSulfurDioxide - pH - Chlorides - FreeSulfurDioxide - Density |
## STARLABEL + AcidIndex + FreeSulfurDioxide + TotalSulfurDioxide +
## Sulphates + VolatileAcidity + pH + starsna + Alcohol, data = traini,
## dist = "poisson")
##
## Pearson residuals:
## Min 1Q Median 3Q Max
## -2.090075 -0.407716 0.007393 0.386758 6.848127
##
## Count model coefficients (poisson with log link):
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.674053 0.044271 15.226 < 2e-16 ***
## VolatileAcidity -0.015204 0.006796 -2.237 0.0253 *
## Alcohol 0.005817 0.001448 4.016 5.91e-05 ***
## AcidIndex -0.020869 0.005029 -4.150 3.32e-05 ***
## starsna -0.164173 0.018647 -8.804 < 2e-16 ***
## STARLABEL 0.177612 0.003774 47.059 < 2e-16 ***
##
## Zero-inflation model coefficients (binomial with logit link):
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -8.5745224 0.3976833 -21.561 < 2e-16 ***
## STARLABEL 0.1120401 0.0402139 2.786 0.00533 **
## AcidIndex 0.4754128 0.0258817 18.369 < 2e-16 ***
## FreeSulfurDioxide -0.0005850 0.0002491 -2.348 0.01885 *
## TotalSulfurDioxide -0.0010312 0.0001572 -6.558 5.44e-11 ***
## Sulphates 0.1215485 0.0398164 3.053 0.00227 **
## VolatileAcidity 0.2078098 0.0459767 4.520 6.19e-06 ***
## pH 0.2128390 0.0530388 4.013 6.00e-05 ***
## starsna 3.7826763 0.1172953 32.249 < 2e-16 ***
## Alcohol 0.0201650 0.0099526 2.026 0.04275 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Number of iterations in BFGS optimization: 25
## Log-likelihood: -2.097e+04 on 16 Df
mod4pred<-round(pois2$fitted.values)
hist(mod4pred)

#write.csv(data.frame(pois2$fitted.values,mod4pred,traini$TARGET),"mod4.csv")
Negative Binomial Regular
##Backwards selection
negbin1<-glm.nb(data = traini,TARGET~.-ï..INDEX-TARGETBIN-lin2fit-STARS-LabelAppeal-FreeSulfurDioxideBIN-AlcoholBIN-FixedAcidity-ResidualSugar-chloridesna-sulphatesna-freesulfurna-totalsulfurna-CitricAcid-sugarna-pHna-Density-pH)
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
summary(negbin1)
##
## Call:
## glm.nb(formula = TARGET ~ . - ï..INDEX - TARGETBIN - lin2fit -
## STARS - LabelAppeal - FreeSulfurDioxideBIN - AlcoholBIN -
## FixedAcidity - ResidualSugar - chloridesna - sulphatesna -
## freesulfurna - totalsulfurna - CitricAcid - sugarna - pHna -
## Density - pH, data = traini, init.theta = 40608.34916, link = log)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.2409 -0.6298 0.0106 0.4475 3.7277
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.142e+00 4.155e-02 27.489 < 2e-16 ***
## VolatileAcidity -3.152e-02 6.518e-03 -4.837 1.32e-06 ***
## Chlorides -3.738e-02 1.647e-02 -2.269 0.023238 *
## FreeSulfurDioxide 9.616e-05 3.506e-05 2.743 0.006087 **
## TotalSulfurDioxide 8.086e-05 2.273e-05 3.557 0.000375 ***
## Sulphates -1.196e-02 5.752e-03 -2.079 0.037645 *
## Alcohol 3.694e-03 1.405e-03 2.629 0.008568 **
## AcidIndex -8.130e-02 4.486e-03 -18.122 < 2e-16 ***
## starsna -1.023e+00 1.697e-02 -60.301 < 2e-16 ***
## STARLABEL 1.735e-01 3.542e-03 48.976 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for Negative Binomial(40608.35) family taken to be 1)
##
## Null deviance: 22860 on 12794 degrees of freedom
## Residual deviance: 13781 on 12785 degrees of freedom
## AIC: 45746
##
## Number of Fisher Scoring iterations: 1
##
##
## Theta: 40608
## Std. Err.: 34537
## Warning while fitting theta: iteration limit reached
##
## 2 x log-likelihood: -45723.68
##Compare effectiveness
mod5pred<-round(negbin1$fitted.values)
hist(round(negbin1$fitted.values))

#write.csv(data.frame(mod5pred,traini$TARGET),"mod5.csv")
Negative Binomial Zero Inflated
traini$starsna<-as.numeric(is.na(train$STARS))
negbin2<-zeroinfl(data=traini,TARGET~.-ï..INDEX-TARGETBIN-lin2fit-STARS-LabelAppeal-totalsulfurna-freesulfurna-chloridesna-FixedAcidity-Sulphates-sugarna-pHna-sulphatesna-FreeSulfurDioxideBIN-CitricAcid-AlcoholBIN-ResidualSugar-Chlorides-FreeSulfurDioxide-pH-TotalSulfurDioxide-Density | VolatileAcidity+FreeSulfurDioxide+TotalSulfurDioxide+pH+Sulphates+Alcohol+LabelAppeal+AcidIndex+STARLABEL+starsna ,dist="negbin")
summary(negbin2)
##
## Call:
## zeroinfl(formula = TARGET ~ . - ï..INDEX - TARGETBIN - lin2fit -
## STARS - LabelAppeal - totalsulfurna - freesulfurna - chloridesna -
## FixedAcidity - Sulphates - sugarna - pHna - sulphatesna - FreeSulfurDioxideBIN -
## CitricAcid - AlcoholBIN - ResidualSugar - Chlorides - FreeSulfurDioxide -
## pH - TotalSulfurDioxide - Density | VolatileAcidity + FreeSulfurDioxide +
## TotalSulfurDioxide + pH + Sulphates + Alcohol + LabelAppeal +
## AcidIndex + STARLABEL + starsna, data = traini, dist = "negbin")
##
## Pearson residuals:
## Min 1Q Median 3Q Max
## -2.20275 -0.44024 -0.05755 0.35819 5.51461
##
## Count model coefficients (negbin with log link):
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.698092 0.043709 15.971 < 2e-16 ***
## VolatileAcidity -0.012344 0.006709 -1.840 0.065761 .
## Alcohol 0.006015 0.001436 4.188 2.82e-05 ***
## AcidIndex -0.017280 0.004830 -3.578 0.000347 ***
## starsna -0.196335 0.018591 -10.561 < 2e-16 ***
## STARLABEL 0.168912 0.003648 46.302 < 2e-16 ***
## Log(theta) 17.934730 4.562331 3.931 8.46e-05 ***
##
## Zero-inflation model coefficients (binomial with logit link):
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 5.7031729 1.2211015 4.671 3.00e-06 ***
## VolatileAcidity 0.1860825 0.0435265 4.275 1.91e-05 ***
## FreeSulfurDioxide -0.0007810 0.0002404 -3.249 0.001157 **
## TotalSulfurDioxide -0.0009614 0.0001514 -6.348 2.18e-10 ***
## pH 0.2111613 0.0507055 4.164 3.12e-05 ***
## Sulphates 0.1312926 0.0384437 3.415 0.000637 ***
## Alcohol 0.0285343 0.0095047 3.002 0.002681 **
## LabelAppeal 4.6312995 0.3968576 11.670 < 2e-16 ***
## AcidIndex 0.4323461 0.0257968 16.760 < 2e-16 ***
## STARLABEL -3.9304336 0.3916674 -10.035 < 2e-16 ***
## starsna 6.0368628 0.3919153 15.403 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Theta = 61511218.4491
## Number of iterations in BFGS optimization: 65
## Log-likelihood: -2.046e+04 on 18 Df
hist(round(negbin2$fitted.values))

#write.csv(data.frame(round(negbin2$fitted.values),traini$TARGET),"mod6x.csv")
Choose 2 Stage GLM for Zeros then weighted linear
##First NA's flags
exp$starsna<-as.numeric(is.na(exp$STARS))
exp$pHna<-as.numeric(is.na(exp$pH))
exp$FreeSulfurDioxideBIN<-as.numeric(is.na(exp$FreeSulfurDioxide))
exp$AlcoholBIN<-as.numeric(is.na(exp$Alcohol))
exp$starsna<-as.numeric(is.na(exp$STARS))
exp$chloridesna<-as.numeric(is.na(exp$Chlorides))
exp$freesulfurna<-as.numeric(is.na(exp$FreeSulfurDioxide))
exp$totalsulfurna<-as.numeric(is.na(exp$TotalSulfurDioxide))
exp$sulphatesna<-as.numeric(is.na(exp$Sulphates))
exp$sugarna<-as.numeric(is.na(exp$ResidualSugar))
#then, impute exp
for(i in 3:ncol(exp)){
exp[is.na(exp[,i]), i] <- median(as.numeric(exp[,i]), na.rm = TRUE)
}
##Combine Label+Stars
exp$STARLABEL<-exp$LabelAppeal+exp$STARS+2
exp$ï..INDEX<-exp$IN
exp$lin2fit<-exp$IN
##Implement GLM Logistic Model for zeros
exp$zeropredict<-round(inv.logit(-1.74129468148971+-0.0780591939241337*exp$VolatileAcidity+-0.0777399671334692*exp$Chlorides+0.000196374725721529*exp$FreeSulfurDioxide+0.0195786126347615*exp$Alcohol+-0.137313163068271*exp$AcidIndex+0.687544417167114*exp$STARLABEL+-1.59605840998015*exp$starsna))
exp2zero<-subset(exp,exp$zeropredict==1)
exp2zero$TARGET<-0
exp2non<-subset(exp,exp$zeropredict==0)
##Now implement the next model on the nonzero.
exp2non$TARGET<-round(1.74129468148971+-0.0780591939241337*exp2non$VolatileAcidity+-0.0777399671334692*exp2non$Chlorides+0.000196374725721529*exp2non$FreeSulfurDioxide+0.0195786126347615*exp2non$Alcohol+-0.137313163068271*exp2non$AcidIndex+0.687544417167114*exp2non$STARLABEL+-1.59605840998015*exp2non$starsna)
exp2done<-rbind(exp2non,exp2zero)
hist(exp2done$TARGET)
