#{r set-options, echo=FALSE, cache=FALSE} #options(width = 12) #
Github Master files: https://github.com/asmozo24/Data621_HW5_Count_Regression
Overview
In this homework assignment, we will explore, analyze and model a data set containing information on approximately 12,000 commercially available wines. The variables are mostly related to the chemical properties of the wine being sold. The response variable is the number of sample cases of wine that were purchased by wine distribution companies after sampling a wine. These cases would be used to provide tasting samples to restaurants and wine stores around the United States. The more sample cases purchased, the more likely a wine is to be sold at a high-end restaurant.
A large wine manufacturer is studying the data in order to predict the number of wine cases ordered based upon the wine characteristics. If the wine manufacturer can predict the number of cases, then that manufacturer will be able to adjust their wine offering to maximize sales.
Our objective is to build a count regression model to predict the number of cases of wine that will be sold given certain properties of the wine. HINT: Sometimes, the fact that a variable is missing is actually predictive of the target.
There are two datasets: wine-training-data and wine-evaluation-data, provided by the instructor, Nasrin Khansari. These are CSV files, and we used the R programming language to load the two datasets, which are pre-stored in a GitHub repository. Of the 16 variables, 14 are predictors; the exceptions are “INDEX”, an identification variable that is not used, and “TARGET”, the response variable, which is already defined within the dataset (see below). The case study: to predict the number of cases of wine that will be sold, given certain properties of the wine.
Below is a short description of the variables of interest in the data set:
VARIABLE.NAME | DEFINITION | THEORETICAL.EFFECT | X | X.1 | X.2 |
---|---|---|---|---|---|
INDEX | Identification Variable (do not use) | None | NA | NA | NA |
TARGET | Number of Cases Purchased | None | NA | NA | NA |
AcidIndex | Proprietary method of testing total acidity of wine by using a weighted average | NA | NA | NA | |
Alcohol | Alcohol Content | NA | NA | NA | |
Chlorides | Chloride content of wine | NA | NA | NA | |
CitricAcid | Citric Acid Content | NA | NA | NA | |
Density | Density of Wine | NA | NA | NA | |
FixedAcidity | Fixed Acidity of Wine | NA | NA | NA | |
FreeSulfurDioxide | Sulfur Dioxide content of wine | NA | NA | NA | |
LabelAppeal | Marketing Score indicating the appeal of label design for consumers. High numbers suggest customers like the label design. Negative numbers suggest customers don’t like the design. | Many consumers purchase based on the visual appeal of the wine label design. Higher numbers suggest better sales. | NA | NA | NA |
ResidualSugar | Residual Sugar of wine | NA | NA | NA | |
STARS | Wine rating by a team of experts. 4 Stars = Excellent, 1 Star = Poor | A high number of stars suggests high sales | NA | NA | NA |
Sulphates | Sulfate content of wine | NA | NA | NA | |
TotalSulfurDioxide | Total Sulfur Dioxide of Wine | NA | NA | NA | |
VolatileAcidity | Volatile Acid content of wine | NA | NA | NA | |
pH | pH of wine | NA | NA | NA |
These datasets include 12795 observations and 16 variables. The variables are all of numeric data type. Some variables (predictors) might need their data type changed if we use them to build the different models.
ï..INDEX | TARGET | FixedAcidity | VolatileAcidity | CitricAcid | ResidualSugar | Chlorides | FreeSulfurDioxide | TotalSulfurDioxide | Density | pH | Sulphates | Alcohol | LabelAppeal | AcidIndex | STARS |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
1 | 3 | 3.2 | 1.160 | -0.98 | 54.2 | -0.567 | NA | 268 | 0.99280 | 3.33 | -0.59 | 9.9 | 0 | 8 | 2 |
2 | 3 | 4.5 | 0.160 | -0.81 | 26.1 | -0.425 | 15 | -327 | 1.02792 | 3.38 | 0.70 | NA | -1 | 7 | 3 |
4 | 5 | 7.1 | 2.640 | -0.88 | 14.8 | 0.037 | 214 | 142 | 0.99518 | 3.12 | 0.48 | 22.0 | -1 | 8 | 3 |
5 | 3 | 5.7 | 0.385 | 0.04 | 18.8 | -0.425 | 22 | 115 | 0.99640 | 2.24 | 1.83 | 6.2 | -1 | 6 | 1 |
6 | 4 | 8.0 | 0.330 | -1.26 | 9.4 | NA | -167 | 108 | 0.99457 | 3.12 | 1.77 | 13.7 | 0 | 9 | 2 |
7 | 0 | 11.3 | 0.320 | 0.59 | 2.2 | 0.556 | -37 | 15 | 0.99940 | 3.20 | 1.29 | 15.4 | 0 | 11 | NA |
## 'data.frame': 12795 obs. of 16 variables:
## $ ï..INDEX : int 1 2 4 5 6 7 8 11 12 13 ...
## $ TARGET : int 3 3 5 3 4 0 0 4 3 6 ...
## $ FixedAcidity : num 3.2 4.5 7.1 5.7 8 11.3 7.7 6.5 14.8 5.5 ...
## $ VolatileAcidity : num 1.16 0.16 2.64 0.385 0.33 0.32 0.29 -1.22 0.27 -0.22 ...
## $ CitricAcid : num -0.98 -0.81 -0.88 0.04 -1.26 0.59 -0.4 0.34 1.05 0.39 ...
## $ ResidualSugar : num 54.2 26.1 14.8 18.8 9.4 ...
## $ Chlorides : num -0.567 -0.425 0.037 -0.425 NA 0.556 0.06 0.04 -0.007 -0.277 ...
## $ FreeSulfurDioxide : num NA 15 214 22 -167 -37 287 523 -213 62 ...
## $ TotalSulfurDioxide: num 268 -327 142 115 108 15 156 551 NA 180 ...
## $ Density : num 0.993 1.028 0.995 0.996 0.995 ...
## $ pH : num 3.33 3.38 3.12 2.24 3.12 3.2 3.49 3.2 4.93 3.09 ...
## $ Sulphates : num -0.59 0.7 0.48 1.83 1.77 1.29 1.21 NA 0.26 0.75 ...
## $ Alcohol : num 9.9 NA 22 6.2 13.7 15.4 10.3 11.6 15 12.6 ...
## $ LabelAppeal : int 0 -1 -1 -1 0 0 0 1 0 0 ...
## $ AcidIndex : int 8 7 8 6 9 11 8 7 6 8 ...
## $ STARS : int 2 3 3 1 2 NA NA 3 NA 4 ...
Summary of wine-training-data
# Descriptive statistics for the wine training data ----

# Summary approaches that were tried and left disabled:
#   library(gtsummary)
#   library(kableExtra)
#   summary(wineT_df)

# pastecs supplies stat.desc(). When this document was knitted, attaching the
# package reported that it was built under R version 4.0.5 and that it masks
# tidyr::extract as well as dplyr::first and dplyr::last.
library(pastecs)

# Session-wide print settings: raise the penalty on scientific notation and
# print two significant digits.
options(scipen = 10, digits = 2)

# Describe the response (TARGET) together with six of the chemical measures.
# Disabled variants that summarised every column instead:
#   stat.desc(wineT_df)
#   stat.desc(wineT_df, basic = TRUE, desc = TRUE, norm = FALSE, p = 0.95)
#   print(stat.desc(wineT_df))
#   View(stat.desc(wineT_df))
#   wineT_df0 <- as_tibble(wineT_df)
s1 <- dplyr::select(
  wineT_df,
  TARGET,
  FixedAcidity,
  VolatileAcidity,
  CitricAcid,
  ResidualSugar,
  Chlorides,
  FreeSulfurDioxide
)
stat.desc(s1, norm = FALSE, p = 0.95)

# Disabled experiment: grouped summary table built with Amisc and pander.
#   # install.packages("devtools")
#   devtools::install_github("AlineTalhouk/Amisc")
#
#   library(Amisc)
#   library(pander)
#   pander::pandoc.table(
#     Amisc::describeBy(
#       data = wineT_df,
#       var.names = "ï..INDEX",
#       by1 = c(
#         "ï..INDEX", "TARGET", "FixedAcidity", "VolatileAcidity",
#         "CitricAcid", "ResidualSugar", "Chlorides", "FreeSulfurDioxide",
#         "TotalSulfurDioxide", "Density", "pH", "Sulphates", "Alcohol",
#         "LabelAppeal", "AcidIndex", "STARS"
#       ),
#       dispersion = "sd", Missing = TRUE,
#       stats = "non-parametric"
#     ),
#     split.tables = Inf
#   )

# Bind the training data frame to a second name, wineT_df1.
wineT_df1 <- wineT_df