# Required Libraries
library(ggplot2)
library(colorspace)
library(car)
suppressMessages(library(sm))
# Preview Direct Marketing Data ----
# Read the customer data set; the first row of the file holds the column names
data.original <- read.csv(
  file = 'C:\\Users\\Jenn\\Documents\\Data Mining\\DirectMarketing.csv',
  header = TRUE
)
# Show the first 10 observations as a centred table, numbers rounded to 3 digits
knitr::kable(head(data.original, 10), align = 'c', digits = 3)
## | Age | Gender | OwnHome | Married | Location | Salary | Children | History | Catalogs | AmountSpent |
## |:------:|:------:|:-------:|:-------:|:--------:|:------:|:--------:|:-------:|:--------:|:-----------:|
## | Old | Female | Own | Single | Far | 47500 | 0 | High | 6 | 755 |
## | Middle | Male | Rent | Single | Close | 63600 | 0 | High | 6 | 1318 |
## | Young | Female | Rent | Single | Close | 13500 | 0 | Low | 18 | 296 |
## | Middle | Male | Own | Married | Close | 85600 | 1 | High | 18 | 2436 |
## | Middle | Female | Own | Single | Close | 68400 | 0 | High | 12 | 1304 |
## | Young | Male | Own | Married | Close | 30400 | 0 | Low | 6 | 495 |
## | Middle | Female | Rent | Single | Close | 48100 | 0 | Medium | 12 | 782 |
## | Middle | Male | Own | Single | Close | 68400 | 0 | High | 18 | 1155 |
## | Middle | Female | Own | Married | Close | 51900 | 3 | Low | 6 | 158 |
## | Old | Male | Own | Married | Far | 80700 | 0 | NA | 18 | 3034 |
## column names (the data type of each column is noted below)
colnames(data.original)
## [1] "Age" "Gender" "OwnHome" "Married" "Location"
## [6] "Salary" "Children" "History" "Catalogs" "AmountSpent"
# Age - categorical (ordinal)
# Gender - categorical (nominal)
# OwnHome - categorical (nominal)
# Married - categorical (nominal)
# Location - categorical (nominal)
# Salary - numerical (continuous)
# Children - numerical (discrete)
# History - categorical (ordinal)
# Catalogs - numerical (discrete)
# AmountSpent - numerical (continuous)
# Check for Missing Data ----
# An NA in History means that the customer has not yet purchased
summary(data.original[["History"]])
## High Low Medium NA's
## 255 230 212 303
## Customers with NO History do NOT help predict AmountSpent, so they are removed
dim(data.original)
## [1] 1000 10
data.reduced <- na.omit(data.original) # drops every row that contains an NA
dim(data.reduced)
## [1] 697 10
#summary(data.reduced$History)
# Keep the rows that have at least one missing value in a separate data frame
data.missing <- subset(data.original, !complete.cases(data.original))
# Summary Statistics for Each Variable ----
# Per-column summary of the full data set (rows with a missing History included)
summary(data.original) # full dataset
## Age Gender OwnHome Married Location
## Middle:508 Female:506 Own :516 Married:502 Close:710
## Old :205 Male :494 Rent:484 Single :498 Far :290
## Young :287
##
##
##
## Salary Children History Catalogs
## Min. : 10100 Min. :0.000 High :255 Min. : 6.00
## 1st Qu.: 29975 1st Qu.:0.000 Low :230 1st Qu.: 6.00
## Median : 53700 Median :1.000 Medium:212 Median :12.00
## Mean : 56104 Mean :0.934 NA's :303 Mean :14.68
## 3rd Qu.: 77025 3rd Qu.:2.000 3rd Qu.:18.00
## Max. :168800 Max. :3.000 Max. :24.00
## AmountSpent
## Min. : 38.0
## 1st Qu.: 488.2
## Median : 962.0
## Mean :1216.8
## 3rd Qu.:1688.5
## Max. :6217.0
# Standard deviations of the numeric columns of the full data set, in the
# order AmountSpent, Salary, Children, Catalogs.
# The column is called "Catalogs"; the earlier `$Catalog` only resolved through
# partial name matching, so the exact column name is spelled out here.
sd.original <- c(
  sd(data.original$AmountSpent),
  sd(data.original$Salary),
  sd(data.original$Children),
  sd(data.original$Catalogs)
)
summary(data.reduced) # data w/o missing values
## Age Gender OwnHome Married Location
## Middle:363 Female:357 Own :369 Married:369 Close:491
## Old :169 Male :340 Rent:328 Single :328 Far :206
## Young :165
##
##
##
## Salary Children History Catalogs
## Min. : 10100 Min. :0.000 High :255 Min. : 6.00
## 1st Qu.: 33200 1st Qu.:0.000 Low :230 1st Qu.:12.00
## Median : 55200 Median :1.000 Medium:212 Median :12.00
## Mean : 58055 Mean :0.901 Mean :15.11
## 3rd Qu.: 79900 3rd Qu.:2.000 3rd Qu.:18.00
## Max. :168800 Max. :3.000 Max. :24.00
## AmountSpent
## Min. : 38
## 1st Qu.: 472
## Median : 918
## Mean :1207
## 3rd Qu.:1702
## Max. :6217
# Density Plots & Histograms ----
# 2 x 2 grid per page: density and histogram of Amount Spent, then of Salary
par(mfrow = c(2, 2))
# Draw the same four panels for the full data set first, then for the reduced one
invisible(lapply(list(data.original, data.reduced), function(customers) {
  plot(density(customers$AmountSpent), main = 'Density Plot for Amount Spent', xlab = 'Amount Spent')
  hist(customers$AmountSpent, main = 'Histogram for Amount Spent', xlab = 'Amount Spent')
  plot(density(customers$Salary), main = 'Density Plot for Salary', xlab = 'Salary')
  hist(customers$Salary, main = 'Histogram for Salary', xlab = 'Salary')
}))

# Correlation Matrix ----
# Full dataset: pairwise correlations between the four numeric columns
numeric.variables.original <- cbind(
  data.original$Salary,
  data.original$Children,
  data.original$Catalogs,
  data.original$AmountSpent
)
correlation.matrix.original <- cor(numeric.variables.original, numeric.variables.original)
# Rows and columns get the same labels, in the order the columns were bound
dimnames(correlation.matrix.original) <- rep(
  list(c("Salary", "Children", "Catalogs", "Amount Spent")),
  2
)
print(correlation.matrix.original)
## Salary Children Catalogs Amount Spent
## Salary 1.00000000 0.04966316 0.1835509 0.6995957
## Children 0.04966316 1.00000000 -0.1134554 -0.2223082
## Catalogs 0.18355086 -0.11345543 1.0000000 0.4726499
## Amount Spent 0.69959571 -0.22230817 0.4726499 1.0000000
# Reduced dataset (rows with missing values removed): same four numeric columns
numeric.variables.reduced <- cbind(
  data.reduced$Salary,
  data.reduced$Children,
  data.reduced$Catalogs,
  data.reduced$AmountSpent
)
correlation.matrix.reduced <- cor(numeric.variables.reduced, numeric.variables.reduced)
# Rows and columns get the same labels, in the order the columns were bound
dimnames(correlation.matrix.reduced) <- rep(
  list(c("Salary", "Children", "Catalogs", "Amount Spent")),
  2
)
print(correlation.matrix.reduced)
## Salary Children Catalogs Amount Spent
## Salary 1.00000000 0.06557328 0.2280786 0.6638265
## Children 0.06557328 1.00000000 -0.1301653 -0.3221111
## Catalogs 0.22807856 -0.13016533 1.0000000 0.5125304
## Amount Spent 0.66382653 -0.32211111 0.5125304 1.0000000
# Scatterplot Matrix ----
# Full dataset: pairwise scatterplots of the four numeric columns
pairs(
  data.original[c("AmountSpent", "Salary", "Children", "Catalogs")],
  main = "Simple Scatterplot Matrix"
)

# Reduced dataset (rows with missing values removed)
pairs(
  data.reduced[c("AmountSpent", "Salary", "Children", "Catalogs")],
  main = "Simple Scatterplot Matrix"
)

# Conditional Density Plots ----
# Concatenate the six categorical columns into a single vector
categorical.variables.original <- with(
  data.original,
  c(Age, Gender, OwnHome, Married, Location, History)
)
# Two panels side by side for each categorical variable
par(mfrow = c(1, 2))
# Colours passed to the plots below
mycolor <- c("Blue", "Green", "Red")
# Age
# Age groups in their natural order Young, Middle, Old. This factor is now the
# grouping passed to sm.density.compare, so the density curves are drawn in the
# same level order that the cdplot call below spells out (previously the factor
# was built but the unordered column was plotted).
AmountSpent.Age <- factor(data.original$Age, levels = c("Young", "Middle", "Old"))
# The x axis carries the amount spent; there is one curve per age group
sm.density.compare(data.original$AmountSpent, AmountSpent.Age, xlab = 'Amount Spent', col = mycolor) # plot densities
title(main = "Density of Amount Spent by Age")
cdplot(factor(Age, levels = c("Young", "Middle", "Old")) ~ AmountSpent, data = data.original, ylab = 'Age', col = mycolor) # ordered ages

# Gender
# Grouping factor with the levels Female, Male; it is the grouping actually
# passed to the density comparison (it used to be created and left unused).
AmountSpent.Gender <- factor(data.original$Gender, levels = c("Female", "Male"))
# The x axis carries the amount spent; there is one curve per gender
sm.density.compare(data.original$AmountSpent, AmountSpent.Gender, xlab = 'Amount Spent', col = mycolor) # plot densities
title(main = "Density of Amount Spent by Gender")
cdplot(Gender ~ AmountSpent, data = data.original, col = mycolor)

# Own Home
# Grouping factor with the levels Own, Rent; it is the grouping actually
# passed to the density comparison (it used to be created and left unused).
AmountSpent.OwnHome <- factor(data.original$OwnHome, levels = c("Own", "Rent"))
# The x axis carries the amount spent; there is one curve per ownership group
sm.density.compare(data.original$AmountSpent, AmountSpent.OwnHome, xlab = 'Amount Spent', col = mycolor) # plot densities
title(main = "Density of Amount Spent by OwnHome")
cdplot(OwnHome ~ AmountSpent, data = data.original, col = mycolor)

# Married
# Grouping factor with the levels Married, Single; it is the grouping actually
# passed to the density comparison (it used to be created and left unused).
AmountSpent.Married <- factor(data.original$Married, levels = c("Married", "Single"))
# The x axis carries the amount spent; there is one curve per marital status
sm.density.compare(data.original$AmountSpent, AmountSpent.Married, xlab = 'Amount Spent', col = mycolor) # plot densities
title(main = "Density of Amount Spent by Married")
cdplot(Married ~ AmountSpent, data = data.original, col = mycolor)

# Location
# Grouping factor with the levels Close, Far; it is the grouping actually
# passed to the density comparison (it used to be created and left unused).
AmountSpent.Location <- factor(data.original$Location, levels = c("Close", "Far"))
# The x axis carries the amount spent; there is one curve per location
sm.density.compare(data.original$AmountSpent, AmountSpent.Location, xlab = 'Amount Spent', col = mycolor) # plot densities
title(main = "Density of Amount Spent by Location")
cdplot(Location ~ AmountSpent, data = data.original, col = mycolor)

# History
# History levels in their natural order Low, Medium, High. This factor is now
# the grouping passed to sm.density.compare, so the density curves are drawn in
# the same level order that the cdplot call below spells out (previously the
# factor was built but the unordered column was plotted).
AmountSpent.History <- factor(data.original$History, levels = c("Low", "Medium", "High"))
# The x axis carries the amount spent; the plot title is added by title() below,
# so no `main` argument is passed here.
sm.density.compare(data.original$AmountSpent, AmountSpent.History, xlab = 'Amount Spent', col = mycolor) # plot densities
## missing data are removed
## missing data are removed
## missing data are removed
## missing data are removed
title(main = "Density of Amount Spent by History")
cdplot(factor(History, levels = c("Low", "Medium", "High")) ~ AmountSpent, data = data.original, ylab = 'History', col = mycolor) # ordered history

# ANOVA Tests Comparing Amount Spent Across Each Categorical Predictor Variable ----
## Age
# Back to a single plotting panel for the boxplots
par(mfrow = c(1, 1))
mycolor <- c("Blue", "Green", "Red")
boxplot(
  AmountSpent ~ factor(Age, levels = c("Young", "Middle", "Old")),
  data = data.original,
  main = "Boxplots Comparing Age & Amount Spent",
  xlab = "Age",
  ylab = "Amount Spent",
  col = mycolor
)

# one data frame per age category: young/middle/old
data.young <- subset(data.original, Age == 'Young')
data.middle <- subset(data.original, Age == 'Middle')
data.old <- subset(data.original, Age == 'Old')
# summary statistics of Amount Spent within each age category
with(data.young, summary(AmountSpent))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 38.0 248.5 422.0 558.6 699.0 3688.0
with(data.middle, summary(AmountSpent))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 157.0 815.8 1320.0 1502.0 2008.0 5878.0
with(data.old, summary(AmountSpent))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 65 638 1120 1432 1985 6217
# test for equal variances
#var.test(data.young$AmountSpent,data.middle$AmountSpent) # unequal pop variances
#var.test(data.young$AmountSpent,data.old$AmountSpent) # unequal pop variances
#var.test(data.middle$AmountSpent,data.old$AmountSpent)
# One-way ANOVA F-test for equal pop means, not assuming equal variances
oneway.test(
  data.original$AmountSpent ~ data.original$Age,
  var.equal = FALSE
) # at least one pop mean is significantly different
##
## One-way analysis of means (not assuming equal variances)
##
## data: data.original$AmountSpent and data.original$Age
## F = 208.08, num df = 2.00, denom df = 477.07, p-value < 2.2e-16
## Gender
# boxplot of Amount Spent for each gender
boxplot(
  AmountSpent ~ factor(Gender, levels = c("Female", "Male")),
  data = data.original,
  main = "Boxplots Comparing Gender & Amount Spent",
  xlab = "Gender",
  ylab = "Amount Spent",
  col = mycolor
)

# one data frame per gender category: female/male
data.female <- subset(data.original, Gender == 'Female')
data.male <- subset(data.original, Gender == 'Male')
# summary statistics of Amount Spent within each gender
with(data.female, summary(AmountSpent))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 47.0 364.2 706.0 1025.0 1442.0 5830.0
with(data.male, summary(AmountSpent))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 38.0 688.8 1216.0 1413.0 1902.0 6217.0
# test for equal variances
#var.test(data.female$AmountSpent,data.male$AmountSpent) # equal pop variances
# One-way ANOVA F-test for equal pop means (equivalent to 2-sample t-test)
oneway.test(
  data.original$AmountSpent ~ data.original$Gender,
  var.equal = TRUE
)
##
## One-way analysis of means
##
## data: data.original$AmountSpent and data.original$Gender
## F = 42.319, num df = 1, denom df = 998, p-value = 1.224e-10
## OwnHome
# boxplot of Amount Spent for renters and owners
boxplot(
  AmountSpent ~ factor(OwnHome, levels = c("Rent", "Own")),
  data = data.original,
  main = "Boxplots Comparing Home Ownership & Amount Spent",
  xlab = "Home Ownership",
  ylab = "Amount Spent",
  col = mycolor
)

# one data frame per home ownership category: rent/own
data.rent <- subset(data.original, OwnHome == 'Rent')
data.own <- subset(data.original, OwnHome == 'Own')
# summary statistics of Amount Spent within each category
with(data.rent, summary(AmountSpent))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 38.0 327.5 623.0 868.8 1187.0 5830.0
with(data.own, summary(AmountSpent))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 65.0 751.8 1360.0 1543.0 2075.0 6217.0
# test for equal variances
#var.test(data.rent$AmountSpent,data.own$AmountSpent)
# unequal pop variances
# One-way ANOVA F-test for equal pop means (equivalent to 2-sample t-test)
oneway.test(
  data.original$AmountSpent ~ data.original$OwnHome,
  var.equal = FALSE
)
##
## One-way analysis of means (not assuming equal variances)
##
## data: data.original$AmountSpent and data.original$OwnHome
## F = 142.98, num df = 1.00, denom df = 934.01, p-value < 2.2e-16
## Married
# boxplot of Amount Spent for married and single customers
boxplot(
  AmountSpent ~ factor(Married, levels = c("Married", "Single")),
  data = data.original,
  main = "Boxplots Comparing Marriage & Amount Spent",
  xlab = "Marriage",
  ylab = "Amount Spent",
  col = mycolor
)

# one data frame per marriage category: married/single
data.married <- subset(data.original, Married == 'Married')
data.single <- subset(data.original, Married == 'Single')
# summary statistics of Amount Spent within each category
with(data.married, summary(AmountSpent))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 93.0 864.8 1515.0 1672.0 2246.0 6217.0
with(data.single, summary(AmountSpent))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 38.0 322.5 576.0 757.8 1011.0 4182.0
# test for equal variances
#var.test(data.married$AmountSpent,data.single$AmountSpent) # unequal pop variances
# One-way ANOVA F-test for equal pop means (equivalent to 2-sample t-test)
oneway.test(
  data.original$AmountSpent ~ data.original$Married,
  var.equal = FALSE
)
##
## One-way analysis of means (not assuming equal variances)
##
## data: data.original$AmountSpent and data.original$Married
## F = 293.37, num df = 1.00, denom df = 797.33, p-value < 2.2e-16
## Location
# boxplot of Amount Spent for close and far customers
boxplot(
  AmountSpent ~ factor(Location, levels = c("Close", "Far")),
  data = data.original,
  main = "Boxplots Comparing Location & Amount Spent",
  xlab = "Location",
  ylab = "Amount Spent",
  col = mycolor
)

# one data frame per location category: close/far
data.close <- subset(data.original, Location == 'Close')
data.far <- subset(data.original, Location == 'Far')
# summary statistics of Amount Spent within each category
with(data.close, summary(AmountSpent))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 38.0 439.8 858.5 1062.0 1497.0 4984.0
with(data.far, summary(AmountSpent))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 123 663 1317 1596 2260 6217
# test for equal variances
#var.test(data.close$AmountSpent,data.far$AmountSpent) # unequal pop variances
# One-way ANOVA F-test for equal pop means (equivalent to 2-sample t-test)
oneway.test(
  data.original$AmountSpent ~ data.original$Location,
  var.equal = FALSE
)
##
## One-way analysis of means (not assuming equal variances)
##
## data: data.original$AmountSpent and data.original$Location
## F = 50.185, num df = 1.00, denom df = 404.96, p-value = 6.239e-12
## History
# Customers without a purchase history are stored as missing values (see the
# NA's count in summary(data.original$History)), not as the text "NA". They are
# therefore relabelled to an explicit "NA" level before plotting; listing "NA"
# in `levels` alone never matches a missing value and leaves that box empty.
boxplot(
  AmountSpent ~ factor(
    ifelse(is.na(History), "NA", as.character(History)),
    levels = c("Low", "Medium", "High", "NA")
  ),
  data = data.original,
  main = "Boxplots Comparing History & Amount Spent",
  xlab = "History",
  ylab = "Amount Spent",
  col = c(mycolor, "Gray") # fourth colour for the no-history group
)

# split data into history categories: low/medium/high and no history (NA)
data.low <- subset(data.original, data.original$History=='Low')
data.medium <- subset(data.original, data.original$History=='Medium')
data.high <- subset(data.original, data.original$History=='High')
# Missing values must be selected with is.na(); comparing with == 'NA' yields
# NA for those rows, and subset() drops them, so the result was always empty.
data.NA <- subset(data.original, is.na(History))
# summary statistics
summary(data.low$AmountSpent)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 38.0 210.0 305.5 357.1 471.5 1120.0
summary(data.medium$AmountSpent)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 438.0 703.8 894.0 950.4 1149.0 2090.0
summary(data.high$AmountSpent)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 565 1522 1974 2186 2628 6217
# test for equal variances
#var.test(data.low$AmountSpent,data.medium$AmountSpent) # unequal pop variances
#var.test(data.low$AmountSpent,data.high$AmountSpent) # unequal pop variances
#var.test(data.medium$AmountSpent,data.high$AmountSpent) # unequal pop variances
# One-way ANOVA F-test for equal pop means; uses data.reduced, i.e. only the
# customers that have a History value
oneway.test(data.reduced$AmountSpent ~ data.reduced$History, var.equal = FALSE)
##
## One-way analysis of means (not assuming equal variances)
##
## data: data.reduced$AmountSpent and data.reduced$History
## F = 620.91, num df = 2.0, denom df = 399.7, p-value < 2.2e-16