Homework 5

Data Preparation

#glm(y ~ offset(log(exposure)) + x, family=poisson(link=log) )
#glm.nb
#zeroinflated  #zeroinfl()
#Conwell-Maxwell-Poisson 

library(MASS)
library(caret)

## Loading required package: lattice

## Loading required package: ggplot2

library(corrplot)
library(pscl)

## Classes and Methods for R developed in the

## Political Science Computational Laboratory

## Department of Political Science

## Stanford University

## Simon Jackman

## hurdle and zeroinfl functions by Achim Zeileis

library(cramer)

## Loading required package: boot

## 
## Attaching package: 'boot'

## The following object is masked from 'package:lattice':
## 
##     melanoma

exp<-read.csv("https://raw.githubusercontent.com/scottogden10/621_hw/master/wine-evaluation-data.csv")
train<-read.csv("https://raw.githubusercontent.com/scottogden10/621_hw/master/wine-training-data.csv")

hist(train$TARGET,col="violet",xlab="",main = "Histogram of Wine Counts")

mean(train$TARGET)

## [1] 3.029074

sd(train$TARGET)

## [1] 1.926368

##Not obnoxiously underdispursed.
##Zero Inflated



corrplot(as.matrix(cor(train[-1],use = "pairwise.complete")),method = "shade")

##Run a KS test on distribitions of TARGET based on nulls for certain variables, to determine if we should use them as binary predictors

##Do x and y come from the same distribution?  Lower when different, Higher when same
pvals<-c(1-ks.test(train[is.na(train$ResidualSugar),2],train[!is.na(train$ResidualSugar),2])$p.value,
1-ks.test(train[is.na(train$Chlorides),2],train[!is.na(train$Chlorides),2])$p.value,
1-ks.test(train[is.na(train$FreeSulfurDioxide),2],train[!is.na(train$FreeSulfurDioxide),2])$p.value,
1-ks.test(train[is.na(train$TotalSulfurDioxide),2],train[!is.na(train$TotalSulfurDioxide),2])$p.value,
1-ks.test(train[is.na(train$pH),2],train[!is.na(train$pH),2])$p.value,
1-ks.test(train[is.na(train$Sulphates),2],train[!is.na(train$Sulphates),2])$p.value,
1-ks.test(train[is.na(train$Alcohol),2],train[!is.na(train$Alcohol),2])$p.value,
1-ks.test(train[is.na(train$STARS),2],train[!is.na(train$STARS),2])$p.value)

## Warning in ks.test(train[is.na(train$ResidualSugar), 2], train[!is.na(train
## $ResidualSugar), : p-value will be approximate in the presence of ties

## Warning in ks.test(train[is.na(train$Chlorides), 2], train[!is.na(train
## $Chlorides), : p-value will be approximate in the presence of ties

## Warning in ks.test(train[is.na(train$FreeSulfurDioxide), 2], train[!
## is.na(train$FreeSulfurDioxide), : p-value will be approximate in the
## presence of ties

## Warning in ks.test(train[is.na(train$TotalSulfurDioxide), 2], train[!
## is.na(train$TotalSulfurDioxide), : p-value will be approximate in the
## presence of ties

## Warning in ks.test(train[is.na(train$pH), 2], train[!is.na(train$pH), 2]):
## p-value will be approximate in the presence of ties

## Warning in ks.test(train[is.na(train$Sulphates), 2], train[!is.na(train
## $Sulphates), : p-value will be approximate in the presence of ties

## Warning in ks.test(train[is.na(train$Alcohol), 2], train[!is.na(train
## $Alcohol), : p-value will be approximate in the presence of ties

## Warning in ks.test(train[is.na(train$STARS), 2], train[!is.na(train
## $STARS), : p-value will be approximate in the presence of ties

names<-c("ResidualSugar","Chlorides","FreeSulfur","Total Sulfur","pH","Sulphates","Alcohol","STARS")

data.frame(names,pvals)

##           names        pvals
## 1 ResidualSugar 0.4000466013
## 2     Chlorides 0.3753065208
## 3    FreeSulfur 0.0142159411
## 4  Total Sulfur 0.1437760700
## 5            pH 0.5677559630
## 6     Sulphates 0.1602013270
## 7       Alcohol 0.0006181788
## 8         STARS 1.0000000000

traini<-train
traini$FreeSulfurDioxideBIN<-as.numeric(is.na(traini$FreeSulfurDioxide))
traini$AlcoholBIN<-as.numeric(is.na(traini$Alcohol))
traini$starsna<-as.numeric(is.na(train$STARS))
traini$sugarna<-as.numeric(is.na(train$ResidualSugar))
traini$chloridesna<-as.numeric(is.na(train$Chlorides))
traini$freesulfurna<-as.numeric(is.na(train$FreeSulfurDioxide))
traini$totalsulfurna<-as.numeric(is.na(train$TotalSulfurDioxide))
traini$pHna<-as.numeric(is.na(train$pH))
traini$sulphatesna<-as.numeric(is.na(train$Sulphates))

Combining and Imputing

#imput median
for(i in 3:ncol(traini)){
  traini[is.na(traini[,i]), i] <- median(as.numeric(traini[,i]), na.rm = TRUE)
}

cormat<-as.matrix(cor(traini[-c(1,2,17,18)],use = "pairwise.complete"))
NFeigen<-eigen(cormat)
Nfact<-sum(as.numeric(NFeigen$values >=1))
Nfact

## [1] 10

library(nFactors)

## Loading required package: psych

## 
## Attaching package: 'psych'

## The following object is masked from 'package:boot':
## 
##     logit

## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha

## 
## Attaching package: 'nFactors'

## The following object is masked from 'package:lattice':
## 
##     parallel

ev <- NFeigen # get eigenvalues
ap <- parallel(subject=nrow(traini[-c(1,2,17,18)]),var=ncol(traini[-c(1,2,17,18)]),
               rep=100,cent=.05)
nS <- nScree(x=ev$values, aparallel=ap$eigen$qevpea)
plotnScree(nS)

componentAxis(cormat, nFactors=6)

## $values
## [1] 1.400333 1.241261 1.067708 1.066332 1.051909 1.038167
## 
## $varExplained
## [1] 6.67 5.91 5.08 5.08 5.01 4.94
## 
## $cumVarExplained
## [1]  6.67 12.58 17.66 22.74 27.75 32.69
## 
## $loadings
##              [,1]         [,2]        [,3]        [,4]        [,5]
##  [1,] -0.34746071  0.428203145 -0.02467452 -0.16837412 -0.23650564
##  [2,] -0.22007217  0.046163111  0.10539815  0.39599637  0.20885358
##  [3,] -0.05090808  0.168652740 -0.24017424 -0.42894452 -0.03939924
##  [4,]  0.07375891 -0.064506720 -0.45369088  0.22140786 -0.19336970
##  [5,] -0.09089572  0.108819357 -0.07635596  0.25302133  0.20102389
##  [6,]  0.09852688 -0.109512176 -0.12239003 -0.04491842 -0.52467487
##  [7,]  0.13864798 -0.173606259 -0.15645510 -0.03407998 -0.36977368
##  [8,] -0.12739178  0.035720573  0.03416182  0.32671312 -0.20442978
##  [9,]  0.06702037 -0.133083914  0.11773136  0.40972820 -0.12990528
## [10,] -0.11681342  0.095001103  0.27543064 -0.07369860 -0.28131563
## [11,]  0.15963922  0.046527455  0.30537939 -0.24865672  0.38100275
## [12,]  0.50610610  0.607385739 -0.04517861  0.13335924 -0.05461606
## [13,] -0.55069022  0.506021811 -0.11228994 -0.08244424 -0.07365447
## [14,]  0.54733816  0.533262711  0.05927593  0.16057715  0.01065830
## [15,] -0.51462651  0.167526461  0.08715146  0.24137994  0.04327930
## [16,]  0.01429524  0.001130028  0.28986068  0.17046689 -0.21866805
## [17,]  0.02475196 -0.010165449  0.51092204 -0.13581881 -0.15890549
## [18,] -0.02157753  0.027880919  0.21372714  0.02542797 -0.10022560
## [19,]  0.01450723  0.003115052  0.22437129  0.03131073 -0.25542342
## [20,]  0.01405612  0.025250520 -0.25755219 -0.07111424  0.07987233
## [21,] -0.02416406 -0.008803681 -0.17280955  0.27614132  0.11352313
##               [,6]
##  [1,]  0.143378781
##  [2,]  0.307455477
##  [3,]  0.128727808
##  [4,]  0.041257341
##  [5,] -0.508737165
##  [6,]  0.241960741
##  [7,] -0.245429980
##  [8,] -0.378662593
##  [9,]  0.385105746
## [10,]  0.200980426
## [11,]  0.156415643
## [12,]  0.006365719
## [13,] -0.015654669
## [14,] -0.004185374
## [15,] -0.013391024
## [16,]  0.085446252
## [17,] -0.145412498
## [18,] -0.214452189
## [19,] -0.150291516
## [20,] -0.048953071
## [21,]  0.266008832
## 
## $communalities
##  [1] 0.40953795 0.35663338 0.29083559 0.30355236 0.38917782 0.37252617
##  [7] 0.27197049 0.31059001 0.36912274 0.22349572 0.35236517 0.64791005
## [13] 0.58439393 0.61337795 0.36081764 0.16840058 0.32659983 0.10360372
## [19] 0.13937166 0.08100152 0.19042678

f<-factanal(traini[-c(1,2,17,18)]
            , factors = 6,
            scores = "Bartlett",rotation = "varimax")

#write.csv(f$correlation,"xzxyt.csv")
traini$STARLABEL<-traini$LabelAppeal+traini$STARS+2

##

Model Building

library(caret)
library(pROC)

## Type 'citation("pROC")' for a citation.

## 
## Attaching package: 'pROC'

## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var

library(car)

## 
## Attaching package: 'car'

## The following object is masked from 'package:psych':
## 
##     logit

## The following object is masked from 'package:boot':
## 
##     logit

traini$lin2fit<-traini$ï..INDEX
traini$TARGETBIN<-traini$ï..INDEX

lin1<-lm(data = traini,TARGET~.-ï..INDEX-STARS-LabelAppeal-lin2fit-sulphatesna-Sulphates-totalsulfurna-FixedAcidity-pHna-CitricAcid-ResidualSugar-FreeSulfurDioxideBIN-freesulfurna-AlcoholBIN-chloridesna-FreeSulfurDioxide-TARGETBIN-pH-Density-sugarna)
wts1 <- 1/fitted(lm(abs(residuals(lin1)) ~ fitted(lin1)))^2
lin1w<-lm(data = traini,TARGET~.-ï..INDEX-STARS-LabelAppeal-lin2fit-sulphatesna-Sulphates-totalsulfurna-FixedAcidity-pHna-CitricAcid-ResidualSugar-FreeSulfurDioxideBIN-freesulfurna-AlcoholBIN-chloridesna-FreeSulfurDioxide-TARGETBIN-pH-Density-sugarna,weights=wts1)

summary(lin1w)

## 
## Call:
## lm(formula = TARGET ~ . - ï..INDEX - STARS - LabelAppeal - lin2fit - 
##     sulphatesna - Sulphates - totalsulfurna - FixedAcidity - 
##     pHna - CitricAcid - ResidualSugar - FreeSulfurDioxideBIN - 
##     freesulfurna - AlcoholBIN - chloridesna - FreeSulfurDioxide - 
##     TARGETBIN - pH - Density - sugarna, data = traini, weights = wts1)
## 
## Weighted Residuals:
##     Min      1Q  Median      3Q     Max 
## -7.3788 -0.7600  0.0452  0.8674  5.0983 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         2.218e+00  8.617e-02  25.736  < 2e-16 ***
## VolatileAcidity    -8.475e-02  1.368e-02  -6.197 5.94e-10 ***
## Chlorides          -9.538e-02  3.458e-02  -2.758  0.00582 ** 
## TotalSulfurDioxide  1.448e-04  4.783e-05   3.027  0.00248 ** 
## Alcohol             1.779e-02  2.960e-03   6.012 1.88e-09 ***
## AcidIndex          -1.753e-01  9.206e-03 -19.044  < 2e-16 ***
## starsna            -2.315e+00  3.188e-02 -72.618  < 2e-16 ***
## STARLABEL           6.479e-01  7.343e-03  88.237  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.288 on 12787 degrees of freedom
## Multiple R-squared:  0.5707, Adjusted R-squared:  0.5705 
## F-statistic:  2429 on 7 and 12787 DF,  p-value: < 2.2e-16

vif(lin1w)

##    VolatileAcidity          Chlorides TotalSulfurDioxide 
##           1.005877           1.001563           1.001915 
##            Alcohol          AcidIndex            starsna 
##           1.007459           1.029928           1.048985 
##          STARLABEL 
##           1.030972

##Performance
lin1fit<-lin1w$fitted.values
lin1fit[lin1fit<0]<-0
lin1fit<-round(lin1fit)

hist(lin1fit)

#write.csv(data.frame(lin1fit,traini$TARGET),"mod1.csv")
##Assign any negative values zero, also round

Linear model 2, 2 stage

##Stage 1 zeros
traini$TARGETBIN<-as.numeric(traini$TARGET==0)
  

zero<-glm(data=traini,TARGETBIN~.-ï..INDEX-TARGET-TARGETBIN-lin2fit-traini$lin2fit-STARS-traini$LabelAppeal-freesulfurna-FixedAcidity-sugarna-Density-chloridesna-FreeSulfurDioxideBIN-sulphatesna-AlcoholBIN-totalsulfurna-CitricAcid-ResidualSugar-Chlorides)
summary(zero)

## 
## Call:
## glm(formula = TARGETBIN ~ . - ï..INDEX - TARGET - TARGETBIN - 
##     lin2fit - traini$lin2fit - STARS - traini$LabelAppeal - freesulfurna - 
##     FixedAcidity - sugarna - Density - chloridesna - FreeSulfurDioxideBIN - 
##     sulphatesna - AlcoholBIN - totalsulfurna - CitricAcid - ResidualSugar - 
##     Chlorides, data = traini)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -0.94395  -0.14341  -0.04317   0.09580   1.03118  
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        -6.434e-03  2.916e-02  -0.221  0.82533    
## VolatileAcidity     1.981e-02  3.604e-03   5.495 3.98e-08 ***
## FreeSulfurDioxide  -6.406e-05  1.947e-05  -3.290  0.00100 ** 
## TotalSulfurDioxide -9.072e-05  1.251e-05  -7.253 4.31e-13 ***
## pH                  1.860e-02  4.219e-03   4.408 1.05e-05 ***
## Sulphates           1.039e-02  3.179e-03   3.268  0.00108 ** 
## Alcohol             2.478e-03  7.780e-04   3.185  0.00145 ** 
## LabelAppeal         1.442e-01  5.737e-03  25.136  < 2e-16 ***
## AcidIndex           4.861e-02  2.179e-03  22.307  < 2e-16 ***
## starsna             5.108e-01  6.554e-03  77.936  < 2e-16 ***
## pHna                3.447e-02  1.629e-02   2.116  0.03436 *  
## STARLABEL          -9.352e-02  3.812e-03 -24.531  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 0.1015266)
## 
##     Null deviance: 2149.8  on 12794  degrees of freedom
## Residual deviance: 1297.8  on 12783  degrees of freedom
## AIC: 7056.9
## 
## Number of Fisher Scoring iterations: 2

confusionMatrix(traini$TARGETBIN,round(zero$fitted.values))

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 8986 1075
##          1  842 1892
##                                           
##                Accuracy : 0.8502          
##                  95% CI : (0.8439, 0.8563)
##     No Information Rate : 0.7681          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.5676          
##  Mcnemar's Test P-Value : 1.166e-07       
##                                           
##             Sensitivity : 0.9143          
##             Specificity : 0.6377          
##          Pos Pred Value : 0.8932          
##          Neg Pred Value : 0.6920          
##              Prevalence : 0.7681          
##          Detection Rate : 0.7023          
##    Detection Prevalence : 0.7863          
##       Balanced Accuracy : 0.7760          
##                                           
##        'Positive' Class : 0               
##

roc(traini$TARGETBIN,round(zero$fitted.values),plot = TRUE)

## 
## Call:
## roc.default(response = traini$TARGETBIN, predictor = round(zero$fitted.values),     plot = TRUE)
## 
## Data: round(zero$fitted.values) in 10061 controls (traini$TARGETBIN 0) < 2734 cases (traini$TARGETBIN 1).
## Area under the curve: 0.7926

traini$lin2fit<-abs(round(zero$fitted.values)-1)

###stage 2 non zero linear

traini2non<-subset(traini,traini$lin2fit==1)
traini2zero<-subset(traini,traini$lin2fit==0)

lin2<-lm(data=traini2non,TARGET~.-ï..INDEX-TARGETBIN-lin2fit-STARS-LabelAppeal-freesulfurna-totalsulfurna-chloridesna-pHna-pH-ResidualSugar-sulphatesna-FixedAcidity-CitricAcid-TotalSulfurDioxide-sugarna-Sulphates-AlcoholBIN-Density-FreeSulfurDioxideBIN)
wts2 <- 1/fitted(lm(abs(residuals(lin2)) ~ fitted(lin2)))^2
lin2w<-lm(data=traini2non, TARGET~.-ï..INDEX-TARGETBIN-lin2fit-STARS-LabelAppeal-freesulfurna-totalsulfurna-chloridesna-pHna-pH-ResidualSugar-sulphatesna-FixedAcidity-CitricAcid-TotalSulfurDioxide-sugarna-Sulphates-AlcoholBIN-Density-FreeSulfurDioxideBIN,weights=wts2)
summary(lin2w)

## 
## Call:
## lm(formula = TARGET ~ . - ï..INDEX - TARGETBIN - lin2fit - STARS - 
##     LabelAppeal - freesulfurna - totalsulfurna - chloridesna - 
##     pHna - pH - ResidualSugar - sulphatesna - FixedAcidity - 
##     CitricAcid - TotalSulfurDioxide - sugarna - Sulphates - AlcoholBIN - 
##     Density - FreeSulfurDioxideBIN, data = traini2non, weights = wts2)
## 
## Weighted Residuals:
##     Min      1Q  Median      3Q     Max 
## -7.4850 -0.6208  0.1571  0.8562  3.9230 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        1.741e+00  9.428e-02  18.470  < 2e-16 ***
## VolatileAcidity   -7.806e-02  1.442e-02  -5.414 6.30e-08 ***
## Chlorides         -7.774e-02  3.648e-02  -2.131   0.0331 *  
## FreeSulfurDioxide  1.964e-04  7.729e-05   2.541   0.0111 *  
## Alcohol            1.958e-02  3.101e-03   6.313 2.86e-10 ***
## AcidIndex         -1.373e-01  1.028e-02 -13.361  < 2e-16 ***
## starsna           -1.596e+00  7.448e-02 -21.429  < 2e-16 ***
## STARLABEL          6.875e-01  7.545e-03  91.127  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.346 on 9820 degrees of freedom
## Multiple R-squared:  0.5068, Adjusted R-squared:  0.5064 
## F-statistic:  1441 on 7 and 9820 DF,  p-value: < 2.2e-16

vif(lin2w)

##   VolatileAcidity         Chlorides FreeSulfurDioxide           Alcohol 
##          1.004213          1.001164          1.001893          1.008642 
##         AcidIndex           starsna         STARLABEL 
##          1.022330          1.046381          1.032497

traini2non$lin2fit<-round(lin2w$fitted.values)

traini2<-rbind(traini2non,traini2zero)
#write.csv(data.frame(traini2$TARGET,traini2$lin2fit),"mod2s.csv")
hist(traini2$lin2fit)

Poisson regular

pois1<-glm(data=traini,TARGET~.-ï..INDEX-TARGETBIN-lin2fit-STARS-LabelAppeal-freesulfurna-FixedAcidity-chloridesna-ResidualSugar-sulphatesna-totalsulfurna-AlcoholBIN-FreeSulfurDioxideBIN-CitricAcid-sugarna-pHna-Density-pH,family=poisson(link=log))
summary(pois1)

## 
## Call:
## glm(formula = TARGET ~ . - ï..INDEX - TARGETBIN - lin2fit - STARS - 
##     LabelAppeal - freesulfurna - FixedAcidity - chloridesna - 
##     ResidualSugar - sulphatesna - totalsulfurna - AlcoholBIN - 
##     FreeSulfurDioxideBIN - CitricAcid - sugarna - pHna - Density - 
##     pH, family = poisson(link = log), data = traini)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -3.2410  -0.6298   0.0106   0.4475   3.7279  
## 
## Coefficients:
##                      Estimate Std. Error z value Pr(>|z|)    
## (Intercept)         1.142e+00  4.155e-02  27.489  < 2e-16 ***
## VolatileAcidity    -3.152e-02  6.517e-03  -4.837 1.32e-06 ***
## Chlorides          -3.737e-02  1.647e-02  -2.270 0.023237 *  
## FreeSulfurDioxide   9.616e-05  3.505e-05   2.743 0.006086 ** 
## TotalSulfurDioxide  8.086e-05  2.273e-05   3.557 0.000375 ***
## Sulphates          -1.196e-02  5.752e-03  -2.079 0.037645 *  
## Alcohol             3.695e-03  1.405e-03   2.629 0.008564 ** 
## AcidIndex          -8.129e-02  4.486e-03 -18.123  < 2e-16 ***
## starsna            -1.023e+00  1.697e-02 -60.302  < 2e-16 ***
## STARLABEL           1.735e-01  3.542e-03  48.979  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 22861  on 12794  degrees of freedom
## Residual deviance: 13781  on 12785  degrees of freedom
## AIC: 45743
## 
## Number of Fisher Scoring iterations: 6

summary.glm(pois1)$deviance/summary.glm(pois1)$df[2]

## [1] 1.077923

#Goodness of fit



1-pchisq(summary(pois1)$deviance,summary(pois1)$df.residual)

## [1] 6.027558e-10

##Poison Model Does not fit the data

## Now compare with TARGET
mod3pred<-round(exp(pois1$linear.predictors))
hist(mod3pred)

min(mod3pred)

## [1] 1

#write.csv(data.frame(traini$TARGET,mod3pred),"mod3x.csv")

Poisson Overcount Zero

#Double backwards selection ##Binomial on zero, poisson on rest

pois2<-zeroinfl(data=traini,formula = TARGET~.-ï..INDEX-TARGETBIN-lin2fit-STARS-LabelAppeal-freesulfurna-AlcoholBIN-ResidualSugar-Sulphates-FixedAcidity-pHna-chloridesna-totalsulfurna-CitricAcid-sulphatesna-sugarna-FreeSulfurDioxideBIN-TotalSulfurDioxide-pH-Chlorides-FreeSulfurDioxide-Density |
         STARLABEL+AcidIndex+FreeSulfurDioxide+TotalSulfurDioxide+Sulphates+VolatileAcidity+pH+starsna+Alcohol
                  ,dist="poisson")
summary(pois2)

## 
## Call:
## zeroinfl(formula = TARGET ~ . - ï..INDEX - TARGETBIN - lin2fit - 
##     STARS - LabelAppeal - freesulfurna - AlcoholBIN - ResidualSugar - 
##     Sulphates - FixedAcidity - pHna - chloridesna - totalsulfurna - 
##     CitricAcid - sulphatesna - sugarna - FreeSulfurDioxideBIN - 
##     TotalSulfurDioxide - pH - Chlorides - FreeSulfurDioxide - Density | 
##     STARLABEL + AcidIndex + FreeSulfurDioxide + TotalSulfurDioxide + 
##         Sulphates + VolatileAcidity + pH + starsna + Alcohol, data = traini, 
##     dist = "poisson")
## 
## Pearson residuals:
##       Min        1Q    Median        3Q       Max 
## -2.090075 -0.407716  0.007393  0.386758  6.848127 
## 
## Count model coefficients (poisson with log link):
##                  Estimate Std. Error z value Pr(>|z|)    
## (Intercept)      0.674053   0.044271  15.226  < 2e-16 ***
## VolatileAcidity -0.015204   0.006796  -2.237   0.0253 *  
## Alcohol          0.005817   0.001448   4.016 5.91e-05 ***
## AcidIndex       -0.020869   0.005029  -4.150 3.32e-05 ***
## starsna         -0.164173   0.018647  -8.804  < 2e-16 ***
## STARLABEL        0.177612   0.003774  47.059  < 2e-16 ***
## 
## Zero-inflation model coefficients (binomial with logit link):
##                      Estimate Std. Error z value Pr(>|z|)    
## (Intercept)        -8.5745224  0.3976833 -21.561  < 2e-16 ***
## STARLABEL           0.1120401  0.0402139   2.786  0.00533 ** 
## AcidIndex           0.4754128  0.0258817  18.369  < 2e-16 ***
## FreeSulfurDioxide  -0.0005850  0.0002491  -2.348  0.01885 *  
## TotalSulfurDioxide -0.0010312  0.0001572  -6.558 5.44e-11 ***
## Sulphates           0.1215485  0.0398164   3.053  0.00227 ** 
## VolatileAcidity     0.2078098  0.0459767   4.520 6.19e-06 ***
## pH                  0.2128390  0.0530388   4.013 6.00e-05 ***
## starsna             3.7826763  0.1172953  32.249  < 2e-16 ***
## Alcohol             0.0201650  0.0099526   2.026  0.04275 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 
## 
## Number of iterations in BFGS optimization: 25 
## Log-likelihood: -2.097e+04 on 16 Df

mod4pred<-round(pois2$fitted.values)
hist(mod4pred)

#write.csv(data.frame(pois2$fitted.values,mod4pred,traini$TARGET),"mod4.csv")

Negative Binomial Regular

##Backwards selection

negbin1<-glm.nb(data = traini,TARGET~.-ï..INDEX-TARGETBIN-lin2fit-STARS-LabelAppeal-FreeSulfurDioxideBIN-AlcoholBIN-FixedAcidity-ResidualSugar-chloridesna-sulphatesna-freesulfurna-totalsulfurna-CitricAcid-sugarna-pHna-Density-pH)

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

summary(negbin1)

## 
## Call:
## glm.nb(formula = TARGET ~ . - ï..INDEX - TARGETBIN - lin2fit - 
##     STARS - LabelAppeal - FreeSulfurDioxideBIN - AlcoholBIN - 
##     FixedAcidity - ResidualSugar - chloridesna - sulphatesna - 
##     freesulfurna - totalsulfurna - CitricAcid - sugarna - pHna - 
##     Density - pH, data = traini, init.theta = 40608.34916, link = log)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -3.2409  -0.6298   0.0106   0.4475   3.7277  
## 
## Coefficients:
##                      Estimate Std. Error z value Pr(>|z|)    
## (Intercept)         1.142e+00  4.155e-02  27.489  < 2e-16 ***
## VolatileAcidity    -3.152e-02  6.518e-03  -4.837 1.32e-06 ***
## Chlorides          -3.738e-02  1.647e-02  -2.269 0.023238 *  
## FreeSulfurDioxide   9.616e-05  3.506e-05   2.743 0.006087 ** 
## TotalSulfurDioxide  8.086e-05  2.273e-05   3.557 0.000375 ***
## Sulphates          -1.196e-02  5.752e-03  -2.079 0.037645 *  
## Alcohol             3.694e-03  1.405e-03   2.629 0.008568 ** 
## AcidIndex          -8.130e-02  4.486e-03 -18.122  < 2e-16 ***
## starsna            -1.023e+00  1.697e-02 -60.301  < 2e-16 ***
## STARLABEL           1.735e-01  3.542e-03  48.976  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Negative Binomial(40608.35) family taken to be 1)
## 
##     Null deviance: 22860  on 12794  degrees of freedom
## Residual deviance: 13781  on 12785  degrees of freedom
## AIC: 45746
## 
## Number of Fisher Scoring iterations: 1
## 
## 
##               Theta:  40608 
##           Std. Err.:  34537 
## Warning while fitting theta: iteration limit reached 
## 
##  2 x log-likelihood:  -45723.68

##Compare effectiveness
mod5pred<-round(negbin1$fitted.values)
hist(round(negbin1$fitted.values))

#write.csv(data.frame(mod5pred,traini$TARGET),"mod5.csv")

Negative Binomial Zero Inflated

traini$starsna<-as.numeric(is.na(train$STARS))

negbin2<-zeroinfl(data=traini,TARGET~.-ï..INDEX-TARGETBIN-lin2fit-STARS-LabelAppeal-totalsulfurna-freesulfurna-chloridesna-FixedAcidity-Sulphates-sugarna-pHna-sulphatesna-FreeSulfurDioxideBIN-CitricAcid-AlcoholBIN-ResidualSugar-Chlorides-FreeSulfurDioxide-pH-TotalSulfurDioxide-Density | VolatileAcidity+FreeSulfurDioxide+TotalSulfurDioxide+pH+Sulphates+Alcohol+LabelAppeal+AcidIndex+STARLABEL+starsna ,dist="negbin")
summary(negbin2)

## 
## Call:
## zeroinfl(formula = TARGET ~ . - ï..INDEX - TARGETBIN - lin2fit - 
##     STARS - LabelAppeal - totalsulfurna - freesulfurna - chloridesna - 
##     FixedAcidity - Sulphates - sugarna - pHna - sulphatesna - FreeSulfurDioxideBIN - 
##     CitricAcid - AlcoholBIN - ResidualSugar - Chlorides - FreeSulfurDioxide - 
##     pH - TotalSulfurDioxide - Density | VolatileAcidity + FreeSulfurDioxide + 
##     TotalSulfurDioxide + pH + Sulphates + Alcohol + LabelAppeal + 
##     AcidIndex + STARLABEL + starsna, data = traini, dist = "negbin")
## 
## Pearson residuals:
##      Min       1Q   Median       3Q      Max 
## -2.20275 -0.44024 -0.05755  0.35819  5.51461 
## 
## Count model coefficients (negbin with log link):
##                  Estimate Std. Error z value Pr(>|z|)    
## (Intercept)      0.698092   0.043709  15.971  < 2e-16 ***
## VolatileAcidity -0.012344   0.006709  -1.840 0.065761 .  
## Alcohol          0.006015   0.001436   4.188 2.82e-05 ***
## AcidIndex       -0.017280   0.004830  -3.578 0.000347 ***
## starsna         -0.196335   0.018591 -10.561  < 2e-16 ***
## STARLABEL        0.168912   0.003648  46.302  < 2e-16 ***
## Log(theta)      17.934730   4.562331   3.931 8.46e-05 ***
## 
## Zero-inflation model coefficients (binomial with logit link):
##                      Estimate Std. Error z value Pr(>|z|)    
## (Intercept)         5.7031729  1.2211015   4.671 3.00e-06 ***
## VolatileAcidity     0.1860825  0.0435265   4.275 1.91e-05 ***
## FreeSulfurDioxide  -0.0007810  0.0002404  -3.249 0.001157 ** 
## TotalSulfurDioxide -0.0009614  0.0001514  -6.348 2.18e-10 ***
## pH                  0.2111613  0.0507055   4.164 3.12e-05 ***
## Sulphates           0.1312926  0.0384437   3.415 0.000637 ***
## Alcohol             0.0285343  0.0095047   3.002 0.002681 ** 
## LabelAppeal         4.6312995  0.3968576  11.670  < 2e-16 ***
## AcidIndex           0.4323461  0.0257968  16.760  < 2e-16 ***
## STARLABEL          -3.9304336  0.3916674 -10.035  < 2e-16 ***
## starsna             6.0368628  0.3919153  15.403  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 
## 
## Theta = 61511218.4491 
## Number of iterations in BFGS optimization: 65 
## Log-likelihood: -2.046e+04 on 18 Df

hist(round(negbin2$fitted.values))

#write.csv(data.frame(round(negbin2$fitted.values),traini$TARGET),"mod6x.csv")

Choose 2 Stage GLM for Zeros then weighted linear

##First NA's flags
exp$starsna<-as.numeric(is.na(exp$STARS))
exp$pHna<-as.numeric(is.na(exp$pH))

exp$FreeSulfurDioxideBIN<-as.numeric(is.na(exp$FreeSulfurDioxide))
exp$AlcoholBIN<-as.numeric(is.na(exp$Alcohol))
exp$starsna<-as.numeric(is.na(exp$STARS))
exp$chloridesna<-as.numeric(is.na(exp$Chlorides))
exp$freesulfurna<-as.numeric(is.na(exp$FreeSulfurDioxide))
exp$totalsulfurna<-as.numeric(is.na(exp$TotalSulfurDioxide))
exp$sulphatesna<-as.numeric(is.na(exp$Sulphates))
exp$sugarna<-as.numeric(is.na(exp$ResidualSugar))

#then, impute exp

for(i in 3:ncol(exp)){
  exp[is.na(exp[,i]), i] <- median(as.numeric(exp[,i]), na.rm = TRUE)
}


##Combine Label+Stars
exp$STARLABEL<-exp$LabelAppeal+exp$STARS+2
exp$ï..INDEX<-exp$IN
exp$lin2fit<-exp$IN

##Implement GLM Logistic Model for zeros
exp$zeropredict<-round(inv.logit(-1.74129468148971+-0.0780591939241337*exp$VolatileAcidity+-0.0777399671334692*exp$Chlorides+0.000196374725721529*exp$FreeSulfurDioxide+0.0195786126347615*exp$Alcohol+-0.137313163068271*exp$AcidIndex+0.687544417167114*exp$STARLABEL+-1.59605840998015*exp$starsna))

exp2zero<-subset(exp,exp$zeropredict==1)
exp2zero$TARGET<-0
exp2non<-subset(exp,exp$zeropredict==0)

##Now implement the next model on the nonzero.

exp2non$TARGET<-round(1.74129468148971+-0.0780591939241337*exp2non$VolatileAcidity+-0.0777399671334692*exp2non$Chlorides+0.000196374725721529*exp2non$FreeSulfurDioxide+0.0195786126347615*exp2non$Alcohol+-0.137313163068271*exp2non$AcidIndex+0.687544417167114*exp2non$STARLABEL+-1.59605840998015*exp2non$starsna)

exp2done<-rbind(exp2non,exp2zero)
hist(exp2done$TARGET)

Homework 5

Scott Ogden

May 20, 2017

Data Preparation

Combining and Imputing

Model Building

Linear model 2, 2 stage

Poisson regular

Poisson Overcount Zero

Negative Binomial Regular

Negative Binomial Zero Inflated

Choose 2 Stage GLM for Zeros then weighted linear