Descriptive Statistics, Density Plots, Boxplots, & ANOVA to analyze Direct Marketing Data

# Required Libraries
library(ggplot2)
library(colorspace)
library(car)
suppressMessages(library(sm))

Preview Direct Marketing Data

# Load the direct-marketing data set.
# stringsAsFactors = TRUE is spelled out because read.csv() returns plain
# character columns by default on R >= 4.0.0, while the per-level counts from
# summary() and the cdplot() calls in this report need factor columns.
# NOTE(review): the absolute Windows path is machine-specific -- confirm or
# replace with a project-relative path before running elsewhere.
data.original <- read.csv('C:\\Users\\Jenn\\Documents\\Data Mining\\DirectMarketing.csv',
                          header = TRUE, stringsAsFactors = TRUE)

# Preview the first 10 observations as a centred table (numbers shown with 3 digits)
knitr::kable(head(data.original, n = 10), digits = 3, align = 'c')

Age	Gender	OwnHome	Married	Location	Salary	Children	History	Catalogs	AmountSpent
Old	Female	Own	Single	Far	47500	0	High	6	755
Middle	Male	Rent	Single	Close	63600	0	High	6	1318
Young	Female	Rent	Single	Close	13500	0	Low	18	296
Middle	Male	Own	Married	Close	85600	1	High	18	2436
Middle	Female	Own	Single	Close	68400	0	High	12	1304
Young	Male	Own	Married	Close	30400	0	Low	6	495
Middle	Female	Rent	Single	Close	48100	0	Medium	12	782
Middle	Male	Own	Single	Close	68400	0	High	18	1155
Middle	Female	Own	Married	Close	51900	3	Low	6	158
Old	Male	Own	Married	Far	80700	0	NA	18	3034

## column names of the data set (the type of each variable is listed below)
colnames(data.original)

##  [1] "Age"         "Gender"      "OwnHome"     "Married"     "Location"   
##  [6] "Salary"      "Children"    "History"     "Catalogs"    "AmountSpent"

# Age - categorical  (ordinal)
# Gender - categorical  (nominal)
# OwnHome - categorical  (nominal)
# Married - categorical  (nominal)
# Location - categorical  (nominal)
# Salary - numerical  (continuous)
# Children - numerical  (discrete)
# History - categorical  (ordinal)
# Catalogs - numerical  (discrete)
# AmountSpent - numerical  (continuous)

Check for Missing Data

# counts per History level; per the author, NA means the customer has not yet purchased
summary(data.original[["History"]])

##   High    Low Medium   NA's 
##    255    230    212    303

## customers w/ NO History are removed below because they do NOT help predict AmountSpent
dim(data.original)

## [1] 1000   10

# keep only the rows that contain no NA in any column
data.reduced <- na.omit(data.original)
dim(data.reduced)

## [1] 697  10

#summary(data.reduced$History)
# rows of the full data set that have at least one missing value
data.missing <- subset(data.original, !complete.cases(data.original))

Summary statistics for each variable

summary(object = data.original)  # complete data set, rows with missing values included

##      Age         Gender    OwnHome       Married     Location  
##  Middle:508   Female:506   Own :516   Married:502   Close:710  
##  Old   :205   Male  :494   Rent:484   Single :498   Far  :290  
##  Young :287                                                    
##                                                                
##                                                                
##                                                                
##      Salary          Children       History       Catalogs    
##  Min.   : 10100   Min.   :0.000   High  :255   Min.   : 6.00  
##  1st Qu.: 29975   1st Qu.:0.000   Low   :230   1st Qu.: 6.00  
##  Median : 53700   Median :1.000   Medium:212   Median :12.00  
##  Mean   : 56104   Mean   :0.934   NA's  :303   Mean   :14.68  
##  3rd Qu.: 77025   3rd Qu.:2.000                3rd Qu.:18.00  
##  Max.   :168800   Max.   :3.000                Max.   :24.00  
##   AmountSpent    
##  Min.   :  38.0  
##  1st Qu.: 488.2  
##  Median : 962.0  
##  Mean   :1216.8  
##  3rd Qu.:1688.5  
##  Max.   :6217.0

# Standard deviations of the four numeric variables in the full data set,
# in the order AmountSpent, Salary, Children, Catalogs (result is named by column).
# Columns are picked by their exact names: `$` on a data frame silently
# partial-matches, so a shortened name such as `$Catalog` would only work by accident.
sd.original <- vapply(
  data.original[c("AmountSpent", "Salary", "Children", "Catalogs")],
  sd,
  numeric(1)
)
summary(data.reduced)   # data w/o missing values

##      Age         Gender    OwnHome       Married     Location  
##  Middle:363   Female:357   Own :369   Married:369   Close:491  
##  Old   :169   Male  :340   Rent:328   Single :328   Far  :206  
##  Young :165                                                    
##                                                                
##                                                                
##                                                                
##      Salary          Children       History       Catalogs    
##  Min.   : 10100   Min.   :0.000   High  :255   Min.   : 6.00  
##  1st Qu.: 33200   1st Qu.:0.000   Low   :230   1st Qu.:12.00  
##  Median : 55200   Median :1.000   Medium:212   Median :12.00  
##  Mean   : 58055   Mean   :0.901                Mean   :15.11  
##  3rd Qu.: 79900   3rd Qu.:2.000                3rd Qu.:18.00  
##  Max.   :168800   Max.   :3.000                Max.   :24.00  
##   AmountSpent  
##  Min.   :  38  
##  1st Qu.: 472  
##  Median : 918  
##  Mean   :1207  
##  3rd Qu.:1702  
##  Max.   :6217

Density Plots & Histograms

par(mfrow = c(2, 2))
# Draw the same four panels (density + histogram of AmountSpent, then density +
# histogram of Salary) first for the full data set, then for the reduced data set.
invisible(lapply(list(data.original, data.reduced), function(customers) {
  plot(density(customers$AmountSpent), main = 'Density Plot for Amount Spent', xlab = 'Amount Spent')
  hist(customers$AmountSpent, main = 'Histogram for Amount Spent', xlab = 'Amount Spent')
  plot(density(customers$Salary), main = 'Density Plot for Salary', xlab = 'Salary')
  hist(customers$Salary, main = 'Histogram for Salary', xlab = 'Salary')
}))

Correlation Matrix

# Full dataset: correlations among the four numeric variables
# (deparse.level = 0 keeps the bound matrix free of column labels)
numeric.variables.original <- with(
  data.original,
  cbind(Salary, Children, Catalogs, AmountSpent, deparse.level = 0)
)
correlation.matrix.original <- cor(numeric.variables.original, numeric.variables.original)
# the same readable labels go on the rows and on the columns
dimnames(correlation.matrix.original) <- rep(
  list(c("Salary", "Children", "Catalogs", "Amount Spent")),
  2
)
print(correlation.matrix.original)

##                  Salary    Children   Catalogs Amount Spent
## Salary       1.00000000  0.04966316  0.1835509    0.6995957
## Children     0.04966316  1.00000000 -0.1134554   -0.2223082
## Catalogs     0.18355086 -0.11345543  1.0000000    0.4726499
## Amount Spent 0.69959571 -0.22230817  0.4726499    1.0000000

# Reduced dataset (rows with missing values removed): correlations among the
# four numeric variables (deparse.level = 0 keeps the bound matrix unlabelled)
numeric.variables.reduced <- with(
  data.reduced,
  cbind(Salary, Children, Catalogs, AmountSpent, deparse.level = 0)
)
correlation.matrix.reduced <- cor(numeric.variables.reduced, numeric.variables.reduced)
# the same readable labels go on the rows and on the columns
dimnames(correlation.matrix.reduced) <- rep(
  list(c("Salary", "Children", "Catalogs", "Amount Spent")),
  2
)
print(correlation.matrix.reduced)

##                  Salary    Children   Catalogs Amount Spent
## Salary       1.00000000  0.06557328  0.2280786    0.6638265
## Children     0.06557328  1.00000000 -0.1301653   -0.3221111
## Catalogs     0.22807856 -0.13016533  1.0000000    0.5125304
## Amount Spent 0.66382653 -0.32211111  0.5125304    1.0000000

Scatterplot Matrix

# Full dataset: pairwise scatterplots of the numeric variables
pairs(data.original[c("AmountSpent", "Salary", "Children", "Catalogs")],
      main = "Simple Scatterplot Matrix")

# Reduced dataset (removing missing values)
pairs(data.reduced[c("AmountSpent", "Salary", "Children", "Catalogs")],
      main = "Simple Scatterplot Matrix")

Conditional Density Plots

# NOTE(review): this combined vector is not referenced by any other statement
# visible in this report -- confirm whether it is still needed.
categorical.variables.original <- c(data.original$Age, data.original$Gender, data.original$OwnHome, data.original$Married, data.original$Location, data.original$History)

# Two panels per variable: left = density of AmountSpent for each group,
# right = conditional density plot of the group given AmountSpent.
par(mfrow = c(1, 2))
mycolor <- c("Blue", "Green", "Red")

# For every variable the explicitly-levelled factor is passed to BOTH plots so
# that a given colour refers to the same group in the left and right panel.
# The x axis of the left panel is the amount spent and is labelled as such;
# the legend identifies the group curves.

# Age (ordered Young, Middle, Old)
AmountSpent.Age <- factor(data.original$Age, levels = c("Young", "Middle", "Old"))
sm.density.compare(data.original$AmountSpent, AmountSpent.Age, xlab = "Amount Spent", col = mycolor)  # plot densities
title(main = "Density of Amount Spent by Age")
legend("topright", legend = levels(AmountSpent.Age), fill = mycolor[seq_len(nlevels(AmountSpent.Age))])
cdplot(AmountSpent.Age ~ AmountSpent, data = data.original, ylab = "Age", col = mycolor)  # ordered ages

# Gender
AmountSpent.Gender <- factor(data.original$Gender, levels = c("Female", "Male"))
sm.density.compare(data.original$AmountSpent, AmountSpent.Gender, xlab = "Amount Spent", col = mycolor)  # plot densities
title(main = "Density of Amount Spent by Gender")
legend("topright", legend = levels(AmountSpent.Gender), fill = mycolor[seq_len(nlevels(AmountSpent.Gender))])
cdplot(AmountSpent.Gender ~ AmountSpent, data = data.original, ylab = "Gender", col = mycolor)

# Own Home
AmountSpent.OwnHome <- factor(data.original$OwnHome, levels = c("Own", "Rent"))
sm.density.compare(data.original$AmountSpent, AmountSpent.OwnHome, xlab = "Amount Spent", col = mycolor)  # plot densities
title(main = "Density of Amount Spent by OwnHome")
legend("topright", legend = levels(AmountSpent.OwnHome), fill = mycolor[seq_len(nlevels(AmountSpent.OwnHome))])
cdplot(AmountSpent.OwnHome ~ AmountSpent, data = data.original, ylab = "OwnHome", col = mycolor)

# Married
AmountSpent.Married <- factor(data.original$Married, levels = c("Married", "Single"))
sm.density.compare(data.original$AmountSpent, AmountSpent.Married, xlab = "Amount Spent", col = mycolor)  # plot densities
title(main = "Density of Amount Spent by Married")
legend("topright", legend = levels(AmountSpent.Married), fill = mycolor[seq_len(nlevels(AmountSpent.Married))])
cdplot(AmountSpent.Married ~ AmountSpent, data = data.original, ylab = "Married", col = mycolor)

# Location
AmountSpent.Location <- factor(data.original$Location, levels = c("Close", "Far"))
sm.density.compare(data.original$AmountSpent, AmountSpent.Location, xlab = "Amount Spent", col = mycolor)  # plot densities
title(main = "Density of Amount Spent by Location")
legend("topright", legend = levels(AmountSpent.Location), fill = mycolor[seq_len(nlevels(AmountSpent.Location))])
cdplot(AmountSpent.Location ~ AmountSpent, data = data.original, ylab = "Location", col = mycolor)

# History (ordered Low, Medium, High; customers with a missing History are
# left out of both panels, which is what the messages below report)
AmountSpent.History <- factor(data.original$History, levels = c("Low", "Medium", "High"))
sm.density.compare(data.original$AmountSpent, AmountSpent.History, xlab = "Amount Spent", col = mycolor)  # plot densities

## missing data are removed
## missing data are removed
## missing data are removed
## missing data are removed

title(main = "Density of Amount Spent by History")
legend("topright", legend = levels(AmountSpent.History), fill = mycolor[seq_len(nlevels(AmountSpent.History))])
cdplot(AmountSpent.History ~ AmountSpent, data = data.original, ylab = "History", col = mycolor)   # ordered history

ANOVA tests to compare each Categorical Predictor Variable

## Age
par(mfrow = c(1, 1))   # one plot per figure from here on
mycolor <- c("Blue", "Green", "Red")
boxplot(AmountSpent ~ factor(Age, levels = c("Young", "Middle", "Old")),
        data = data.original, col = mycolor,
        main = "Boxplots Comparing Age & Amount Spent",
        xlab = "Age", ylab = "Amount Spent")

# one data frame per age category: young/middle/old
data.young  <- subset(data.original, Age == "Young")
data.middle <- subset(data.original, Age == "Middle")
data.old    <- subset(data.original, Age == "Old")
# summary statistics of the amount spent, group by group
summary(data.young[["AmountSpent"]])

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    38.0   248.5   422.0   558.6   699.0  3688.0

summary(data.middle[["AmountSpent"]])

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   157.0   815.8  1320.0  1502.0  2008.0  5878.0

summary(data.old[["AmountSpent"]])

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##      65     638    1120    1432    1985    6217

# pairwise checks for equal variances (kept for reference, not run)
#var.test(data.young$AmountSpent,data.middle$AmountSpent)   # unequal pop variances
#var.test(data.young$AmountSpent,data.old$AmountSpent)   # unequal pop variances
#var.test(data.middle$AmountSpent,data.old$AmountSpent)

# One-way ANOVA F-test for equal pop means, not assuming equal variances
# result: at least one pop mean is significantly different
oneway.test(data.original$AmountSpent ~ data.original$Age, var.equal = FALSE)

## 
##  One-way analysis of means (not assuming equal variances)
## 
## data:  data.original$AmountSpent and data.original$Age
## F = 208.08, num df = 2.00, denom df = 477.07, p-value < 2.2e-16

## Gender
# boxplot of the amount spent for each gender
boxplot(AmountSpent ~ factor(Gender, levels = c("Female", "Male")),
        data = data.original, col = mycolor,
        main = "Boxplots Comparing Gender & Amount Spent",
        xlab = "Gender", ylab = "Amount Spent")

# one data frame per gender category: female/male
data.female <- subset(data.original, Gender == "Female")
data.male   <- subset(data.original, Gender == "Male")
# summary statistics of the amount spent, group by group
summary(data.female[["AmountSpent"]])

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    47.0   364.2   706.0  1025.0  1442.0  5830.0

summary(data.male[["AmountSpent"]])

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    38.0   688.8  1216.0  1413.0  1902.0  6217.0

# check for equal variances (kept for reference, not run)
#var.test(data.female$AmountSpent,data.male$AmountSpent)  # equal pop variances

# One-way ANOVA F-test for equal pop means, assuming equal variances
# (equivalent to 2-sample t-test)
oneway.test(data.original$AmountSpent ~ data.original$Gender, var.equal = TRUE)

## 
##  One-way analysis of means
## 
## data:  data.original$AmountSpent and data.original$Gender
## F = 42.319, num df = 1, denom df = 998, p-value = 1.224e-10

## OwnHome
# boxplot of the amount spent for renters and owners
boxplot(AmountSpent ~ factor(OwnHome, levels = c("Rent", "Own")),
        data = data.original, col = mycolor,
        main = "Boxplots Comparing Home Ownership & Amount Spent",
        xlab = "Home Ownership", ylab = "Amount Spent")

# one data frame per home ownership category: rent/own
data.rent <- subset(data.original, OwnHome == "Rent")
data.own  <- subset(data.original, OwnHome == "Own")
# summary statistics of the amount spent, group by group
summary(data.rent[["AmountSpent"]])

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    38.0   327.5   623.0   868.8  1187.0  5830.0

summary(data.own[["AmountSpent"]])

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    65.0   751.8  1360.0  1543.0  2075.0  6217.0

# check for equal variances (kept for reference, not run)
#var.test(data.rent$AmountSpent,data.own$AmountSpent)
# unequal pop variances

# One-way ANOVA F-test for equal pop means, not assuming equal variances
# (equivalent to 2-sample t-test)
oneway.test(data.original$AmountSpent ~ data.original$OwnHome, var.equal = FALSE)

## 
##  One-way analysis of means (not assuming equal variances)
## 
## data:  data.original$AmountSpent and data.original$OwnHome
## F = 142.98, num df = 1.00, denom df = 934.01, p-value < 2.2e-16

## Married
# boxplot of the amount spent for married and single customers
boxplot(AmountSpent ~ factor(Married, levels = c("Married", "Single")),
        data = data.original, col = mycolor,
        main = "Boxplots Comparing Marriage & Amount Spent",
        xlab = "Marriage", ylab = "Amount Spent")

# one data frame per marriage category: married/single
data.married <- subset(data.original, Married == "Married")
data.single  <- subset(data.original, Married == "Single")
# summary statistics of the amount spent, group by group
summary(data.married[["AmountSpent"]])

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    93.0   864.8  1515.0  1672.0  2246.0  6217.0

summary(data.single[["AmountSpent"]])

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    38.0   322.5   576.0   757.8  1011.0  4182.0

# check for equal variances (kept for reference, not run)
#var.test(data.married$AmountSpent,data.single$AmountSpent)    # unequal pop variances
# One-way ANOVA F-test for equal pop means, not assuming equal variances
# (equivalent to 2-sample t-test)
oneway.test(data.original$AmountSpent ~ data.original$Married, var.equal = FALSE)

## 
##  One-way analysis of means (not assuming equal variances)
## 
## data:  data.original$AmountSpent and data.original$Married
## F = 293.37, num df = 1.00, denom df = 797.33, p-value < 2.2e-16

## Location
# boxplot of the amount spent for close and far customers
boxplot(AmountSpent ~ factor(Location, levels = c("Close", "Far")),
        data = data.original, col = mycolor,
        main = "Boxplots Comparing Location & Amount Spent",
        xlab = "Location", ylab = "Amount Spent")

# one data frame per location category: close/far
data.close <- subset(data.original, Location == "Close")
data.far   <- subset(data.original, Location == "Far")
# summary statistics of the amount spent, group by group
summary(data.close[["AmountSpent"]])

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    38.0   439.8   858.5  1062.0  1497.0  4984.0

summary(data.far[["AmountSpent"]])

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     123     663    1317    1596    2260    6217

# check for equal variances (kept for reference, not run)
#var.test(data.close$AmountSpent,data.far$AmountSpent)    # unequal pop variances

# One-way ANOVA F-test for equal pop means, not assuming equal variances
# (equivalent to 2-sample t-test)
oneway.test(data.original$AmountSpent ~ data.original$Location, var.equal = FALSE)

## 
##  One-way analysis of means (not assuming equal variances)
## 
## data:  data.original$AmountSpent and data.original$Location
## F = 50.185, num df = 1.00, denom df = 404.96, p-value = 6.239e-12

## History
# Customers without a purchase history carry a missing (NA) History value.
# A real NA never equals the string "NA", so merely listing "NA" as a factor
# level leaves that box empty; the missing values are therefore relabelled
# "NA" explicitly so those customers get their own (gray) box.
boxplot(AmountSpent ~ factor(ifelse(is.na(History), "NA", as.character(History)),
                             levels = c("Low", "Medium", "High", "NA")),
        data = data.original,
        main = "Boxplots Comparing History & Amount Spent", xlab = "History", ylab = "Amount Spent",
        col = c(mycolor, "Gray"))

# split data into history categories: low/medium/high, plus the customers
# whose History is missing
data.low <- subset(data.original, History == 'Low')
data.medium <- subset(data.original, History == 'Medium')
data.high <- subset(data.original, History == 'High')
# `History == 'NA'` evaluates to NA for every row and would select nothing,
# so missing values are tested with is.na()
data.NA <- subset(data.original, is.na(History))
# summary statistics
summary(data.low$AmountSpent)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    38.0   210.0   305.5   357.1   471.5  1120.0

summary(data.medium$AmountSpent)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   438.0   703.8   894.0   950.4  1149.0  2090.0

summary(data.high$AmountSpent)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     565    1522    1974    2186    2628    6217

# test for equal variances (kept for reference, not run)
#var.test(data.low$AmountSpent,data.medium$AmountSpent)    # unequal pop variances
#var.test(data.low$AmountSpent,data.high$AmountSpent)    # unequal pop variances
#var.test(data.medium$AmountSpent,data.high$AmountSpent)    # unequal pop variances

# One-way ANOVA F-test for equal pop means, not assuming equal variances;
# run on the reduced data set, i.e. only customers with a known History
oneway.test(data.reduced$AmountSpent ~ data.reduced$History, var.equal = FALSE)

## 
##  One-way analysis of means (not assuming equal variances)
## 
## data:  data.reduced$AmountSpent and data.reduced$History
## F = 620.91, num df = 2.0, denom df = 399.7, p-value < 2.2e-16

Descriptive Statistics, Density Plots, Boxplots, & ANOVA to analyze Direct Marketing Data

Jenn Murphy

December 27, 2015

Preview Direct Marketing Data

Check for Missing Data

Summary statistics for each variable

Density Plots & Histograms

Correlation Matrix

Scatterplot Matrix

Conditional Density Plots

ANOVA tests to compare each Categorical Predictor Variable