#{r set-options, echo=FALSE, cache=FALSE} #options(width = 12) #

Github Master files: https://github.com/asmozo24/Data621_HW5_Count_Regression

Overview

In this homework assignment, we will explore, analyze and model a data set containing information on approximately 12,000 commercially available wines. The variables are mostly related to the chemical properties of the wine being sold. The response variable is the number of sample cases of wine that were purchased by wine distribution companies after sampling a wine. These cases would be used to provide tasting samples to restaurants and wine stores around the United States. The more sample cases purchased, the more likely a wine is to be sold at a high-end restaurant.

A large wine manufacturer is studying the data in order to predict the number of wine cases ordered based upon the wine characteristics. If the wine manufacturer can predict the number of cases, then that manufacturer will be able to adjust their wine offering to maximize sales.

Our objective is to build a count regression model to predict the number of cases of wine that will be sold given certain properties of the wine. HINT: Sometimes, the fact that a variable is missing is actually predictive of the target.

1. Data Exploration

There are two datasets, wine-training-data and wine-evaluation-data, provided by the instructor, Nasrin Khansari. These are CSV files, and we used the R programming language to load both datasets from the GitHub repository where they are stored. Of the 16 variables in the training data, “INDEX” is an identification variable that is not used for modeling, “TARGET” is the response variable, and the remaining 14 variables are predictors; all of them are already defined within the dataset (see below). The case study: to predict the number of sample cases of a wine that will be purchased, given the properties of that wine.

Below is a short description of the variables of interest in the data set:

VARIABLE.NAME DEFINITION THEORETICAL.EFFECT X X.1 X.2
INDEX Identification Variable (do not use) None NA NA NA
TARGET Number of Cases Purchased None NA NA NA
AcidIndex Proprietary method of testing total acidity of wine by using a weighted average NA NA NA
Alcohol Alcohol Content NA NA NA
Chlorides Chloride content of wine NA NA NA
CitricAcid Citric Acid Content NA NA NA
Density Density of Wine NA NA NA
FixedAcidity Fixed Acidity of Wine NA NA NA
FreeSulfurDioxide Sulfur Dioxide content of wine NA NA NA
LabelAppeal Marketing Score indicating the appeal of label design for consumers. High numbers suggest customers like the label design. Negative numbers suggest customers don’t like the design. Many consumers purchase based on the visual appeal of the wine label design. Higher numbers suggest better sales. NA NA NA
ResidualSugar Residual Sugar of wine NA NA NA
STARS Wine rating by a team of experts. 4 Stars = Excellent, 1 Star = Poor A high number of stars suggests high sales NA NA NA
Sulphates Sulfate content of wine NA NA NA
TotalSulfurDioxide Total Sulfur Dioxide of Wine NA NA NA
VolatileAcidity Volatile Acid content of wine NA NA NA
pH pH of wine NA NA NA

Data Structure

The training dataset includes 12,795 observations and 16 variables. All of the variables are numeric (integer or double). Some of the predictors might need to be converted to a different data type before we use them to build the different models.

ï..INDEX TARGET FixedAcidity VolatileAcidity CitricAcid ResidualSugar Chlorides FreeSulfurDioxide TotalSulfurDioxide Density pH Sulphates Alcohol LabelAppeal AcidIndex STARS
1 3 3.2 1.160 -0.98 54.2 -0.567 NA 268 0.99280 3.33 -0.59 9.9 0 8 2
2 3 4.5 0.160 -0.81 26.1 -0.425 15 -327 1.02792 3.38 0.70 NA -1 7 3
4 5 7.1 2.640 -0.88 14.8 0.037 214 142 0.99518 3.12 0.48 22.0 -1 8 3
5 3 5.7 0.385 0.04 18.8 -0.425 22 115 0.99640 2.24 1.83 6.2 -1 6 1
6 4 8.0 0.330 -1.26 9.4 NA -167 108 0.99457 3.12 1.77 13.7 0 9 2
7 0 11.3 0.320 0.59 2.2 0.556 -37 15 0.99940 3.20 1.29 15.4 0 11 NA
## 'data.frame':    12795 obs. of  16 variables:
##  $ ï..INDEX          : int  1 2 4 5 6 7 8 11 12 13 ...
##  $ TARGET            : int  3 3 5 3 4 0 0 4 3 6 ...
##  $ FixedAcidity      : num  3.2 4.5 7.1 5.7 8 11.3 7.7 6.5 14.8 5.5 ...
##  $ VolatileAcidity   : num  1.16 0.16 2.64 0.385 0.33 0.32 0.29 -1.22 0.27 -0.22 ...
##  $ CitricAcid        : num  -0.98 -0.81 -0.88 0.04 -1.26 0.59 -0.4 0.34 1.05 0.39 ...
##  $ ResidualSugar     : num  54.2 26.1 14.8 18.8 9.4 ...
##  $ Chlorides         : num  -0.567 -0.425 0.037 -0.425 NA 0.556 0.06 0.04 -0.007 -0.277 ...
##  $ FreeSulfurDioxide : num  NA 15 214 22 -167 -37 287 523 -213 62 ...
##  $ TotalSulfurDioxide: num  268 -327 142 115 108 15 156 551 NA 180 ...
##  $ Density           : num  0.993 1.028 0.995 0.996 0.995 ...
##  $ pH                : num  3.33 3.38 3.12 2.24 3.12 3.2 3.49 3.2 4.93 3.09 ...
##  $ Sulphates         : num  -0.59 0.7 0.48 1.83 1.77 1.29 1.21 NA 0.26 0.75 ...
##  $ Alcohol           : num  9.9 NA 22 6.2 13.7 15.4 10.3 11.6 15 12.6 ...
##  $ LabelAppeal       : int  0 -1 -1 -1 0 0 0 1 0 0 ...
##  $ AcidIndex         : int  8 7 8 6 9 11 8 7 6 8 ...
##  $ STARS             : int  2 3 3 1 2 NA NA 3 NA 4 ...

Summary of wine-training-data

#library(gtsummary)
#library(kableExtra)
#summary (wineT_df)

library(pastecs)
## Warning:
## package
## 'pastecs'
## was built
## under R
## version
## 4.0.5
## 
## Attaching package: 'pastecs'
## The following object is masked from 'package:tidyr':
## 
##     extract
## The following objects are masked from 'package:dplyr':
## 
##     first,
##     last
#stat.desc(wineT_df)

# Printing setup for the summary table: show 2 significant digits and
# penalise scientific notation so values are printed in fixed notation.
options(scipen = 10, digits = 2)

# Alternatives kept for reference (describe every column at once):
# stat.desc(wineT_df, basic = TRUE, desc = TRUE, norm = FALSE, p = 0.95)
# select(data = wineT_df, TARGET, FixedAcidity, VolatileAcidity, CitricAcid,
#        ResidualSugar, Chlorides, FreeSulfurDioxide)

# Restrict the summary to the response (TARGET) and six chemical predictors.
s1 <- dplyr::select(
  wineT_df,
  TARGET,
  FixedAcidity,
  VolatileAcidity,
  CitricAcid,
  ResidualSugar,
  Chlorides,
  FreeSulfurDioxide
)

# Descriptive statistics without the normality statistics (norm = FALSE);
# p = 0.95 is the confidence level passed to stat.desc.
# Other switches not set here: basic = TRUE, desc = FALSE.
stat.desc(s1, norm = FALSE, p = 0.95)
#wineT_df0 <- as_tibble(wineT_df)
 #print(stat.desc(wineT_df))
#View(stat.desc(wineT_df))
# code only group for summary by grouping

# # install.packages("devtools")
# devtools::install_github("AlineTalhouk/Amisc")
# 
# library(Amisc)
# library(pander)
# pander::pandoc.table(Amisc::describeBy(
#   data = wineT_df,
#   var.names = "ï..INDEX",
#   by1 = c("ï..INDEX", "TARGET", "FixedAcidity", "VolatileAcidity", "CitricAcid", "ResidualSugar", "Chlorides", "FreeSulfurDioxide", "TotalSulfurDioxide", "Density", "pH", "Sulphates", "Alcohol", "LabelAppeal", "AcidIndex", "STARS"),
#   dispersion = "sd", Missing = TRUE,
#   stats = "non-parametric"
# ),
# split.tables = Inf
# )

# Bind a second name to the training data so that wineT_df itself is left as
# loaded. NOTE(review): presumably later steps modify wineT_df1 (e.g. cleaning
# or imputation) — confirm against the rest of the file.
wineT_df1 <- wineT_df