rm(list = ls())
packages <- c("psych",
"stargazer",
"tidyverse",
"corrplot",
"ggplot2",
"data.table",
"car",
"MASS",
"Metrics",
"vars",
"gptstudio",
"visdat",
"reshape2",
"corrplot",
"pscl"
)
for (i in 1:length(packages)) {
if (!packages[i] %in% rownames(installed.packages())) {
install.packages(packages[i]
, repos = "http://cran.rstudio.com/"
, dependencies = TRUE
)
}
library(packages[i], character.only = TRUE)
}
## Warning: 程辑包'psych'是用R版本4.3.1 来建造的
##
## Please cite as:
## Hlavac, Marek (2022). stargazer: Well-Formatted Regression and Summary Statistics Tables.
## R package version 5.2.3. https://CRAN.R-project.org/package=stargazer
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.2 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.2 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ ggplot2::%+%() masks psych::%+%()
## ✖ ggplot2::alpha() masks psych::alpha()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
## corrplot 0.92 loaded
##
##
## 载入程辑包:'data.table'
##
##
## The following objects are masked from 'package:lubridate':
##
## hour, isoweek, mday, minute, month, quarter, second, wday, week,
## yday, year
##
##
## The following objects are masked from 'package:dplyr':
##
## between, first, last
##
##
## The following object is masked from 'package:purrr':
##
## transpose
##
##
## 载入需要的程辑包:carData
##
##
## 载入程辑包:'car'
##
##
## The following object is masked from 'package:dplyr':
##
## recode
##
##
## The following object is masked from 'package:purrr':
##
## some
##
##
## The following object is masked from 'package:psych':
##
## logit
##
##
##
## 载入程辑包:'MASS'
##
##
## The following object is masked from 'package:dplyr':
##
## select
##
##
## 载入需要的程辑包:strucchange
##
## 载入需要的程辑包:zoo
##
##
## 载入程辑包:'zoo'
##
##
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
##
##
## 载入需要的程辑包:sandwich
##
##
## 载入程辑包:'strucchange'
##
##
## The following object is masked from 'package:stringr':
##
## boundary
##
##
## 载入需要的程辑包:urca
##
## 载入需要的程辑包:lmtest
##
##
## 载入程辑包:'reshape2'
##
##
## The following objects are masked from 'package:data.table':
##
## dcast, melt
##
##
## The following object is masked from 'package:tidyr':
##
## smiths
##
##
## Classes and Methods for R developed in the
## Political Science Computational Laboratory
## Department of Political Science
## Stanford University
## Simon Jackman
## hurdle and zeroinfl functions by Achim Zeileis
Originaldf <- read.csv("D:/Study/Econometric/Assignment/Wine/wine-training-data.csv")
set.seed(123)
# Generate a vector of indices for the training set
train_index <- sample(x = nrow(Originaldf), size = round(0.8 * nrow(Originaldf) ) )
# Create the training and testing sets
train <- Originaldf[train_index, ]
test <- Originaldf[-train_index, ]
mylabel <- c( 'INDEX' = 'Index',
'TARGET' = '#Purchased',
'AcidIndex' = 'Method',
'Alcohol' = 'Alcohol',
'Chlorides' = 'Chloride',
'CitricAcid' = 'CitricAcid',
'Density' = 'Density',
'FixedAcidity' = 'FixAcid',
'FreeSulfurDioxide' = 'SulfurDioxide',
'LabelAppeal' = 'MarketScore',
'ResidualSugar' = 'Sugar',
'STARS' = 'Rating',
'Sulphates' = 'Sulfate',
'TotalSulfurDioxide' = 'SumSulDioxide',
'VolatileAcidity' = 'VolatileAcid',
'pH' = 'pHValue'
)
vis_dat(x = train) #visualize the data
vis_miss(x = train)
in the graph we can see in the dataset there are only two types of data and there is 4% of missing values.
#make the dataset become long for plotting.
trainlong <- melt(train)
## No id variables; using all as measure variables
#plot the histogram of each variable
ggplot(trainlong, aes(x=value)) +
geom_histogram(binwidth=1, fill="blue", color="black") +
facet_wrap(~variable, scales="free", labeller = as_labeller(mylabel)) +
theme(plot.title = element_text(hjust = 0.5))
## Warning: Removed 6578 rows containing non-finite values (`stat_bin()`).
#count the number of missing values in each variable
dplyr::summarise(.data = train,
across(.cols = everything(),
.fns = ~ sum(is.na(.x))
)
) %>%
glimpse()
## Rows: 1
## Columns: 16
## $ INDEX <int> 0
## $ TARGET <int> 0
## $ FixedAcidity <int> 0
## $ VolatileAcidity <int> 0
## $ CitricAcid <int> 0
## $ ResidualSugar <int> 494
## $ Chlorides <int> 525
## $ FreeSulfurDioxide <int> 539
## $ TotalSulfurDioxide <int> 532
## $ Density <int> 0
## $ pH <int> 321
## $ Sulphates <int> 960
## $ Alcohol <int> 519
## $ LabelAppeal <int> 0
## $ AcidIndex <int> 0
## $ STARS <int> 2688
#calculate the percentage of missing values in each variable
round(x = sapply(X = train,
FUN = function(df){100*sum(is.na(df==TRUE)/length(df))}),
digits = 2)
## INDEX TARGET FixedAcidity VolatileAcidity
## 0.00 0.00 0.00 0.00
## CitricAcid ResidualSugar Chlorides FreeSulfurDioxide
## 0.00 4.83 5.13 5.27
## TotalSulfurDioxide Density pH Sulphates
## 5.20 0.00 3.14 9.38
## Alcohol LabelAppeal AcidIndex STARS
## 5.07 0.00 0.00 26.26
In the histogram plot, we can see the average number of purchase is around 4, the fix acidity average is about 5. The average Score of the wine is around 0. The most common rating of the wine is 2 and 1. The NAs value are in pH, Sulphates, Alcohol, ResidualSugar, Chlorides, FreeSulfurDioxide, TotalSulfurDioxide and Stars.
corrplot(cor(train, use = "na.or.complete"),
type = "lower",
method = "square"
)
In the correlation plots we can see Label Appeal and wine stars has a strong relationship with number of Purchased, Stars has strong relationship with marketing score
#remove all NA values from the train dataset
trainC <- na.omit(train)
#print the structure of the train dataset
str(trainC)
## 'data.frame': 5147 obs. of 16 variables:
## $ INDEX : int 3172 3772 2319 11765 4253 8527 12393 3491 6449 11520 ...
## $ TARGET : int 3 3 3 4 4 4 0 0 6 6 ...
## $ FixedAcidity : num -2.4 9.1 5.7 5.5 5.6 8.1 16.5 6.5 10.2 -3.3 ...
## $ VolatileAcidity : num 1.08 0.98 0.21 0.3 0.485 0.27 -1.72 0.19 -0.5 0.25 ...
## $ CitricAcid : num 0.25 0.58 1.32 1.2 0.16 0.33 1.79 0.49 -0.34 0.27 ...
## $ ResidualSugar : num -10 33.6 1.6 67.6 12.6 ...
## $ Chlorides : num -0.051 0.036 0.03 0.029 0.051 0.32 0.028 0.04 0.238 0.039 ...
## $ FreeSulfurDioxide : num 321 44 277 28 193 171 65 31 23 37 ...
## $ TotalSulfurDioxide: num 138 161 122 97 115 100 158 466 -526 482 ...
## $ Density : num 1 1.03 0.99 1.02 1.02 ...
## $ pH : num 4.47 3.06 3.94 3.23 4.15 4.04 3.26 4.28 3.2 3.27 ...
## $ Sulphates : num -1.05 0.41 0.52 0.44 0.38 0.44 -0.47 0.68 0.39 -0.83 ...
## $ Alcohol : num 4.8 8.9 18.3 21.7 12 15.9 10.7 11.9 7.34 16.2 ...
## $ LabelAppeal : int 0 -1 -1 0 0 0 -1 0 1 0 ...
## $ AcidIndex : int 7 9 6 8 6 9 7 7 7 7 ...
## $ STARS : int 2 2 2 2 3 3 1 1 4 4 ...
## - attr(*, "na.action")= 'omit' Named int [1:5089] 1 3 4 5 10 11 17 18 20 21 ...
## ..- attr(*, "names")= chr [1:5089] "2463" "10419" "8718" "12483" ...
As the list above, to clean the missing value I have to drop almost 50% of my data, but I think it is worthy that if I impute those missing data with mean or median I will lose the accuracy of this study.
Poisson1 <- glm(data = trainC, family = poisson, trainC$TARGET ~. -trainC$INDEX )
summary(Poisson1)
##
## Call:
## glm(formula = trainC$TARGET ~ . - trainC$INDEX, family = poisson,
## data = trainC)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.594e+00 2.799e-01 5.695 1.24e-08 ***
## INDEX 1.680e-06 1.572e-06 1.069 0.28520
## FixedAcidity -1.914e-04 1.177e-03 -0.163 0.87083
## VolatileAcidity -2.623e-02 9.339e-03 -2.808 0.00498 **
## CitricAcid 2.977e-03 8.458e-03 0.352 0.72486
## ResidualSugar -9.270e-05 2.169e-04 -0.427 0.66905
## Chlorides -3.885e-02 2.294e-02 -1.694 0.09034 .
## FreeSulfurDioxide 8.268e-05 4.959e-05 1.667 0.09543 .
## TotalSulfurDioxide 1.440e-05 3.225e-05 0.446 0.65525
## Density -3.850e-01 2.751e-01 -1.399 0.16171
## pH -8.063e-03 1.084e-02 -0.744 0.45691
## Sulphates -1.846e-03 7.848e-03 -0.235 0.81401
## Alcohol 2.938e-03 1.975e-03 1.487 0.13697
## LabelAppeal 1.725e-01 8.806e-03 19.595 < 2e-16 ***
## AcidIndex -4.600e-02 6.535e-03 -7.039 1.93e-12 ***
## STARS 1.889e-01 8.337e-03 22.655 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: 4634.3 on 5146 degrees of freedom
## Residual deviance: 3186.6 on 5131 degrees of freedom
## AIC: 18539
##
## Number of Fisher Scoring iterations: 5
The coefficient represent the change in the log count for each unit increase in the predictor, assuming the other predictors are held constant. For example, the coefficient for Alcohol is 2.938e-03. This means that for each unit increase in Alcohol, the log count of the TARGET variable is expected to increase by 2.938e-03, while all other variables are held constant. From this output, it appears that LabelAppeal, AcidIndex, and STARS are highly significant predictors. VolatileAcidity is also significant with a p-value less than 0.01. Other variables such as Alcohol, Density, FreeSulfurDioxide, and Chlorides are not statistically significant at common thresholds (like 0.05), suggesting they might not have a substantial effect on the TARGET variable in this model.
Poisson2 <- glm(data = trainC, family = poisson, trainC$TARGET ~ trainC$STARS + trainC$LabelAppeal + trainC$Alcohol -trainC$INDEX )
summary(Poisson2)
##
## Call:
## glm(formula = trainC$TARGET ~ trainC$STARS + trainC$LabelAppeal +
## trainC$Alcohol - trainC$INDEX, family = poisson, data = trainC)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.819404 0.027444 29.857 <2e-16 ***
## trainC$STARS 0.195858 0.008262 23.706 <2e-16 ***
## trainC$LabelAppeal 0.170212 0.008770 19.409 <2e-16 ***
## trainC$Alcohol 0.003734 0.001967 1.898 0.0577 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: 4634.3 on 5146 degrees of freedom
## Residual deviance: 3258.9 on 5143 degrees of freedom
## AIC: 18587
##
## Number of Fisher Scoring iterations: 4
In summary, STARS, LabelAppeal, and Alcohol are all significant predictors, with STARS and LabelAppeal being highly significant.
binomial1 <- glm.nb(data = trainC, trainC$TARGET ~. -trainC$INDEX)
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
summary(binomial1)
##
## Call:
## glm.nb(formula = trainC$TARGET ~ . - trainC$INDEX, data = trainC,
## init.theta = 140865.1965, link = log)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.594e+00 2.799e-01 5.694 1.24e-08 ***
## INDEX 1.680e-06 1.572e-06 1.069 0.28520
## FixedAcidity -1.914e-04 1.177e-03 -0.163 0.87084
## VolatileAcidity -2.623e-02 9.339e-03 -2.808 0.00498 **
## CitricAcid 2.977e-03 8.458e-03 0.352 0.72486
## ResidualSugar -9.270e-05 2.169e-04 -0.427 0.66905
## Chlorides -3.885e-02 2.294e-02 -1.694 0.09034 .
## FreeSulfurDioxide 8.268e-05 4.959e-05 1.667 0.09544 .
## TotalSulfurDioxide 1.440e-05 3.225e-05 0.446 0.65525
## Density -3.850e-01 2.751e-01 -1.399 0.16171
## pH -8.063e-03 1.084e-02 -0.744 0.45692
## Sulphates -1.846e-03 7.849e-03 -0.235 0.81401
## Alcohol 2.938e-03 1.975e-03 1.487 0.13697
## LabelAppeal 1.725e-01 8.806e-03 19.595 < 2e-16 ***
## AcidIndex -4.600e-02 6.535e-03 -7.039 1.93e-12 ***
## STARS 1.889e-01 8.338e-03 22.654 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for Negative Binomial(140865.2) family taken to be 1)
##
## Null deviance: 4634.2 on 5146 degrees of freedom
## Residual deviance: 3186.5 on 5131 degrees of freedom
## AIC: 18541
##
## Number of Fisher Scoring iterations: 1
##
##
## Theta: 140865
## Std. Err.: 264201
## Warning while fitting theta: iteration limit reached
##
## 2 x log-likelihood: -18506.62
The variables LabelAppeal, AcidIndex, and STARS are highly statistically significant. The large value of theta suggests that there might be overdispersion in the data, which means that there is more variation in the data than would be expected based on the Negative Binomial model.
binomial2 <- glm.nb(data = trainC, trainC$TARGET ~ trainC$STARS + trainC$LabelAppeal + trainC$Alcohol -trainC$INDEX )
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
summary(binomial2)
##
## Call:
## glm.nb(formula = trainC$TARGET ~ trainC$STARS + trainC$LabelAppeal +
## trainC$Alcohol - trainC$INDEX, data = trainC, init.theta = 141467.724,
## link = log)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.819403 0.027445 29.856 <2e-16 ***
## trainC$STARS 0.195858 0.008262 23.705 <2e-16 ***
## trainC$LabelAppeal 0.170212 0.008770 19.409 <2e-16 ***
## trainC$Alcohol 0.003734 0.001967 1.898 0.0577 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for Negative Binomial(141467.7) family taken to be 1)
##
## Null deviance: 4634.2 on 5146 degrees of freedom
## Residual deviance: 3258.8 on 5143 degrees of freedom
## AIC: 18589
##
## Number of Fisher Scoring iterations: 1
##
##
## Theta: 141468
## Std. Err.: 267865
## Warning while fitting theta: iteration limit reached
##
## 2 x log-likelihood: -18578.91
Both STARS and LabelAppeal have very low p-values, indicating that these predictors are significantly associated with the target variable. The Alcohol variable has a p-value close to the usual 0.05 significance level. This suggests that Alcohol may have a marginal effect on the target variable.
Linear1 <- lm(data = trainC, trainC$TARGET ~. -trainC$INDEX)
summary(Linear1)
##
## Call:
## lm(formula = trainC$TARGET ~ . - trainC$INDEX, data = trainC)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.0127 -0.5129 0.1165 0.7209 3.1918
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.528e+00 6.169e-01 7.341 2.46e-13 ***
## INDEX 5.388e-06 3.481e-06 1.548 0.12174
## FixedAcidity -6.378e-06 2.606e-03 -0.002 0.99805
## VolatileAcidity -9.849e-02 2.061e-02 -4.779 1.81e-06 ***
## CitricAcid 9.579e-03 1.875e-02 0.511 0.60942
## ResidualSugar -3.450e-04 4.794e-04 -0.720 0.47180
## Chlorides -1.393e-01 5.087e-02 -2.738 0.00620 **
## FreeSulfurDioxide 2.844e-04 1.096e-04 2.594 0.00952 **
## TotalSulfurDioxide 5.342e-05 7.104e-05 0.752 0.45209
## Density -1.274e+00 6.063e-01 -2.101 0.03568 *
## pH -2.261e-02 2.396e-02 -0.943 0.34554
## Sulphates -5.686e-03 1.733e-02 -0.328 0.74286
## Alcohol 1.298e-02 4.354e-03 2.980 0.00290 **
## LabelAppeal 6.281e-01 1.933e-02 32.486 < 2e-16 ***
## AcidIndex -1.561e-01 1.370e-02 -11.398 < 2e-16 ***
## STARS 7.325e-01 1.904e-02 38.472 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.154 on 5131 degrees of freedom
## Multiple R-squared: 0.4416, Adjusted R-squared: 0.4399
## F-statistic: 270.5 on 15 and 5131 DF, p-value: < 2.2e-16
Based on this output, it appears that LabelAppeal, AcidIndex, and STARS are the strongest predictors of TARGET. Variables like VolatileAcidity, Chlorides, FreeSulfurDioxide, Density, and Alcohol are also statistically significant at less stringent levels, but their effects are smaller.
Linear2 <- lm(data = trainC, trainC$TARGET ~ trainC$STARS + trainC$LabelAppeal + trainC$Alcohol -trainC$INDEX )
summary(Linear2)
##
## Call:
## lm(formula = trainC$TARGET ~ trainC$STARS + trainC$LabelAppeal +
## trainC$Alcohol - trainC$INDEX, data = trainC)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.0040 -0.5211 0.1321 0.7502 3.1332
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.937985 0.060686 31.934 < 2e-16 ***
## trainC$STARS 0.759083 0.019223 39.488 < 2e-16 ***
## trainC$LabelAppeal 0.620253 0.019606 31.635 < 2e-16 ***
## trainC$Alcohol 0.015291 0.004411 3.467 0.000531 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.173 on 5143 degrees of freedom
## Multiple R-squared: 0.4213, Adjusted R-squared: 0.421
## F-statistic: 1248 on 3 and 5143 DF, p-value: < 2.2e-16
In conclusion, this model suggests that STARS, LabelAppeal, and Alcohol are significant predictors of TARGET. Among these, STARS has the highest impact, followed by LabelAppeal and then Alcohol.
stargazer(binomial1, binomial2, Poisson1,Poisson2,Linear1, Linear2, type = "text", title = "Model Summary")
##
## Model Summary
## ====================================================================================================================================================
## Dependent variable:
## --------------------------------------------------------------------------------------------------------------------------------
## TARGET
## negative Poisson OLS
## binomial
## (1) (2) (3) (4) (5) (6)
## ----------------------------------------------------------------------------------------------------------------------------------------------------
## INDEX 0.00000 0.00000 0.00001
## (0.00000) (0.00000) (0.00000)
##
## FixedAcidity -0.0002 -0.0002 -0.00001
## (0.001) (0.001) (0.003)
##
## VolatileAcidity -0.026*** -0.026*** -0.098***
## (0.009) (0.009) (0.021)
##
## CitricAcid 0.003 0.003 0.010
## (0.008) (0.008) (0.019)
##
## ResidualSugar -0.0001 -0.0001 -0.0003
## (0.0002) (0.0002) (0.0005)
##
## Chlorides -0.039* -0.039* -0.139***
## (0.023) (0.023) (0.051)
##
## FreeSulfurDioxide 0.0001* 0.0001* 0.0003***
## (0.00005) (0.00005) (0.0001)
##
## TotalSulfurDioxide 0.00001 0.00001 0.0001
## (0.00003) (0.00003) (0.0001)
##
## Density -0.385 -0.385 -1.274**
## (0.275) (0.275) (0.606)
##
## pH -0.008 -0.008 -0.023
## (0.011) (0.011) (0.024)
##
## Sulphates -0.002 -0.002 -0.006
## (0.008) (0.008) (0.017)
##
## Alcohol 0.003 0.003 0.013***
## (0.002) (0.002) (0.004)
##
## LabelAppeal 0.173*** 0.173*** 0.628***
## (0.009) (0.009) (0.019)
##
## AcidIndex -0.046*** -0.046*** -0.156***
## (0.007) (0.007) (0.014)
##
## STARS 0.189*** 0.189*** 0.733***
## (0.008) (0.008) (0.019)
##
## STARS 0.196*** 0.196*** 0.759***
## (0.008) (0.008) (0.019)
##
## LabelAppeal 0.170*** 0.170*** 0.620***
## (0.009) (0.009) (0.020)
##
## Alcohol 0.004* 0.004* 0.015***
## (0.002) (0.002) (0.004)
##
## Constant 1.594*** 0.819*** 1.594*** 0.819*** 4.528*** 1.938***
## (0.280) (0.027) (0.280) (0.027) (0.617) (0.061)
##
## ----------------------------------------------------------------------------------------------------------------------------------------------------
## Observations 5,147 5,147 5,147 5,147 5,147 5,147
## R2 0.442 0.421
## Adjusted R2 0.440 0.421
## Log Likelihood -9,254.311 -9,290.457 -9,253.269 -9,289.415
## theta 140,865.200 (264,201.100) 141,467.700 (267,865.100)
## Akaike Inf. Crit. 18,540.620 18,588.910 18,538.540 18,586.830
## Residual Std. Error 1.154 (df = 5131) 1.173 (df = 5143)
## F Statistic 270.488*** (df = 15; 5131) 1,248.199*** (df = 3; 5143)
## ====================================================================================================================================================
## Note: *p<0.1; **p<0.05; ***p<0.01
In all the model, Stars and number of purchased are significance predictor which make sense that people tend to buy wine that has higher rating and if they like that wine, they will give it a good rate which form a good circulation. Also, I have the same result for my Poisson and binomial model which I guess is because the Poisson regression model assumes that the mean and variance of the distribution are equaland the negative binomial regression model is a generalization of the Poisson regression model that allows for overdispersion, where the variance is greater than the mean. So this could means that the data is not overdispersed and thus fits well to the Poisson distribution’s assumption of equidispersion.
# Calculate the pR2 for the binomial1 model
b1R2 <- pR2(binomial1)
## fitting null model for pseudo-r2
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
b1R2
## llh llhNull G2 McFadden r2ML
## -9.253311e+03 -9.977172e+03 1.447722e+03 7.255172e-02 2.451792e-01
## r2CU
## 2.503656e-01
# Calculate the pR2 for the binomial2 model
b2R2 <- pR2(binomial2)
## fitting null model for pseudo-r2
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
b2R2
## llh llhNull G2 McFadden r2ML
## -9289.4567713 -9977.1722408 1375.4309390 0.0689289 0.2345027
## r2CU
## 0.2394633
# Calculate the pR2 for the Poisson1 model
P1R2 <- pR2(Poisson1)
## fitting null model for pseudo-r2
P1R2
## llh llhNull G2 McFadden r2ML
## -9.253269e+03 -9.977139e+03 1.447739e+03 7.255282e-02 2.451817e-01
## r2CU
## 2.503682e-01
# Calculate the pR2 for the Poisson2 model
P2R2 <- pR2(Poisson2)
## fitting null model for pseudo-r2
P2R2
## llh llhNull G2 McFadden r2ML
## -9289.4154484 -9977.1386049 1375.4463130 0.0689299 0.2345050
## r2CU
## 0.2394657
Based on the outputs, the best model appears to be the multiple linear regression model 2 because it has fewer predictos which makes it easier to interpret and less likely to overfit the data. Also this model has an adjusted R-squared of 0.421, which suggests that it explains about 42.1% of the variability in TARGET. Although this isn’t dramatically higher than some of the other models, it is quite good considering the model’s simplicity.
predictedbuys <-predict(object = Linear2,
newdata = test,
interval = "prediction")
## Warning: 'newdata'必需有2559行 但变量里有5147行
summary(predictedbuys)
## fit lwr upr
## Min. :1.458 Min. :-0.8451 Min. :3.761
## 1st Qu.:2.891 1st Qu.: 0.5901 1st Qu.:5.192
## Median :3.612 Median : 1.3119 Median :5.912
## Mean :3.674 Mean : 1.3729 Mean :5.975
## 3rd Qu.:4.358 3rd Qu.: 2.0576 3rd Qu.:6.659
## Max. :6.577 Max. : 4.2727 Max. :8.882