#install packages
temp.packages <- c("car","gpairs","corrplot","psych","knitr","gplots")
temp.packages <- temp.packages[!(temp.packages %in% installed.packages()[,"Package"])]
if(length(temp.packages)>0) install.packages(temp.packages)
remove("temp.packages")
#graphic 設定
old.par = par(mfrow=c(1,1), mar=c(3, 3, 3, 3))
par(old.par)
#e (mathematical constant) 2.71828
1+sum(1/factorial(1:50))
## [1] 2.718282
log(2)
## [1] 0.6931472
exp(0.6931472)
## [1] 2
#got it?
###お客1000人のデータを作ります。
#my postal code Sumida-ku Osiage.
set.seed(1310045)
#the number of customers.s
ncust <- 1000
#customer unique id
cust.df <- data.frame(cust.id=factor(1:ncust))
#cutomer age
cust.df$age <- rnorm(n=ncust, mean=35, sd=5)
#信用格付
cust.df$credit.score <- rnorm(n=ncust, mean=3*cust.df$age+620, sd=50)
#メール有無は8:2に設定
cust.df$email <- factor(sample(c("yes", "no"), size=ncust, replace=TRUE, prob=c(0.8, 0.2)))
#同じ処理 = rlnorm(n=ncust, meanlog = 2 , sdlog = 1.2)
cust.df$distance.to.store <- exp(rnorm(n=ncust, mean=2, sd=1.2)) #sotreまでの距離
#distance.to.store(対数正規分布)
hist(cust.df$distance.to.store, main = "lognormal distribution", breaks = 30)
####オンライン訪問
cust.df$online.visits <- rnbinom(ncust, size=0.3,
mu = 15 + ifelse(cust.df$email=="yes", 15, 0) - 0.7 * (cust.df$age-median(cust.df$age)))
#An alternative parametrization (often used in ecology) is by the mean mu (see above), and size, the dispersion parameter, where prob = size/(size+mu). The variance is mu + mu^2/size in this parametrization.
#オンライントランザクション数
cust.df$online.trans <- rbinom(ncust, size=cust.df$online.visits, prob=0.3)
#オンライン売り上げ
cust.df$online.spend <- exp(rnorm(ncust, mean=3, sd=0.1)) * cust.df$online.trans
#店舗トランザクション数
cust.df$store.trans <- rnbinom(ncust, size=5, mu=3 / sqrt(cust.df$distance.to.store))
#店舗トランザクション売り上げ
cust.df$store.spend <- exp(rnorm(ncust, mean=3.5, sd=0.4)) * cust.df$store.trans
#summary(cust.df) or psych::describe(cust.df)
#満足度全て
sat.overall <- rnorm(ncust, mean=3.1, sd=0.7)
#サービス満足度調査結果(サービス&商品選定)
sat.service <- floor(sat.overall + rnorm(ncust, mean=0.5, sd=0.4))
sat.selection <- floor(sat.overall + rnorm(ncust, mean=-0.2, sd=0.6))
summary(cbind(sat.service, sat.selection))
## sat.service sat.selection
## Min. :0.000 Min. :-1.00
## 1st Qu.:3.000 1st Qu.: 2.00
## Median :3.000 Median : 2.00
## Mean :3.084 Mean : 2.39
## 3rd Qu.:4.000 3rd Qu.: 3.00
## Max. :5.000 Max. : 5.00
#[−1 to 6] -> [1 to 5]
#満足度は1から5
sat.service[sat.service > 5] <- 5
sat.service[sat.service < 1] <- 1
sat.selection[sat.selection > 5] <- 5
sat.selection[sat.selection < 1] <- 1
#summary(cbind(sat.service, sat.selection), digit=1)
#4.1.4 Simulating Non-Response Data
no.response <- as.logical(rbinom(ncust, size=1, prob=cust.df$age/100))
##NAは統計から除去してsummmary
sat.service[no.response] <- NA
sat.selection[no.response] <- NA
summary(cbind(sat.service, sat.selection))
## sat.service sat.selection
## Min. :1.000 Min. :1.000
## 1st Qu.:3.000 1st Qu.:2.000
## Median :3.000 Median :2.000
## Mean :3.118 Mean :2.464
## 3rd Qu.:4.000 3rd Qu.:3.000
## Max. :5.000 Max. :5.000
## NA's :339 NA's :339
cust.df$sat.service <- sat.service
cust.df$sat.selection <- sat.selection
summary(cust.df)
## cust.id age credit.score email
## 1 : 1 Min. :19.38 Min. :532.6 no :195
## 2 : 1 1st Qu.:31.11 1st Qu.:687.5 yes:805
## 3 : 1 Median :34.48 Median :726.9
## 4 : 1 Mean :34.59 Mean :723.3
## 5 : 1 3rd Qu.:37.97 3rd Qu.:756.9
## 6 : 1 Max. :51.63 Max. :869.6
## (Other):994
## distance.to.store online.visits online.trans online.spend
## Min. : 0.09843 Min. : 0.00 Min. : 0.000 Min. : 0.00
## 1st Qu.: 3.39106 1st Qu.: 0.00 1st Qu.: 0.000 1st Qu.: 0.00
## Median : 7.77187 Median : 6.00 Median : 2.000 Median : 38.92
## Mean : 15.51549 Mean : 28.92 Mean : 8.716 Mean : 177.72
## 3rd Qu.: 16.44356 3rd Qu.: 33.00 3rd Qu.: 10.000 3rd Qu.: 194.50
## Max. :287.90872 Max. :402.00 Max. :133.000 Max. :2960.72
##
## store.trans store.spend sat.service sat.selection
## Min. : 0.000 Min. : 0.00 Min. :1.000 Min. :1.000
## 1st Qu.: 0.000 1st Qu.: 0.00 1st Qu.:3.000 1st Qu.:2.000
## Median : 1.000 Median : 30.07 Median :3.000 Median :2.000
## Mean : 1.339 Mean : 47.74 Mean :3.118 Mean :2.464
## 3rd Qu.: 2.000 3rd Qu.: 63.34 3rd Qu.:4.000 3rd Qu.:3.000
## Max. :16.000 Max. :521.35 Max. :5.000 Max. :5.000
## NA's :339 NA's :339
#temp variablesを除去
rm(ncust, sat.overall, sat.service, sat.selection, no.response)
#お客データ構造確認
str(cust.df)
## 'data.frame': 1000 obs. of 12 variables:
## $ cust.id : Factor w/ 1000 levels "1","2","3","4",..: 1 2 3 4 5 6 7 8 9 10 ...
## $ age : num 37.6 36.7 42.3 38.4 42.2 ...
## $ credit.score : num 633 734 751 725 745 ...
## $ email : Factor w/ 2 levels "no","yes": 2 1 2 2 2 2 1 1 2 2 ...
## $ distance.to.store: num 11.39 10.98 1.48 1.44 39.09 ...
## $ online.visits : num 29 0 14 41 31 26 7 12 55 1 ...
## $ online.trans : int 12 0 5 18 11 7 1 4 20 0 ...
## $ online.spend : num 229.3 0 97.5 333.7 241.7 ...
## $ store.trans : num 2 1 0 3 0 3 1 2 0 0 ...
## $ store.spend : num 79.6 33.8 0 117.7 0 ...
## $ sat.service : num NA NA 1 4 4 2 5 NA 4 3 ...
## $ sat.selection : num NA NA 1 4 1 2 3 NA 4 3 ...
#散布図(Scatter plot)
#年と信用格付の関係
plot(cust.df$age, cust.df$credit.score,
col="blue",
xlim=c(15, 55), ylim=c(500, 900),
main="Active Customers as of June 2014",
xlab="Customer Age (years)", ylab="Customer Credit Score ",
panel.first = grid(3, lty = 1, lwd = 2)
)
abline(h=mean(cust.df$credit.score), col="dark blue", lty="dotted", lw=3)
abline(v=mean(cust.df$age), col="dark blue", lty="dotted", lw=3)
# 1年間店舗とオンライン売り上げ比較
# !!! buy more online buy less in stores?
plot(cust.df$store.spend, cust.df$online.spend,
main="Customers as of June 2014",
xlab="Prior 12 months in-store sales ($)",
ylab="Prior 12 months online sales ($)",
cex=0.7)
#plotを呼ぶとき、実はデータタイプにより違うメッソードが呼ばれる。
methods(plot)[1:20]
## [1] "plot,allcategorical_missing_data.frame,binary-method"
## [2] "plot,allcategorical_missing_data.frame,categorical-method"
## [3] "plot,ANY,ANY-method"
## [4] "plot,color,ANY-method"
## [5] "plot,mi_list,ANY-method"
## [6] "plot,mi,ANY-method"
## [7] "plot,missing_data.frame,binary-method"
## [8] "plot,missing_data.frame,categorical-method"
## [9] "plot,missing_data.frame,missing_variable-method"
## [10] "plot,missing_data.frame,semi-continuous-method"
## [11] "plot,profile.mle,missing-method"
## [12] "plot,spam,missing-method"
## [13] "plot,spam,spam-method"
## [14] "plot,Spatial,missing-method"
## [15] "plot,SpatialGrid,missing-method"
## [16] "plot,SpatialLines,missing-method"
## [17] "plot,SpatialMultiPoints,missing-method"
## [18] "plot,SpatialPixels,missing-method"
## [19] "plot,SpatialPoints,missing-method"
## [20] "plot,SpatialPolygons,missing-method"
#店舗売り上げ
hist(cust.df$store.spend,
breaks=(0:ceiling(max(cust.df$store.spend)/10))*10,
main="Customers as of June 2014",
xlab="Prior 12 months online sales ($)",
ylab="Count of customers",
col = rainbow(30)
)
#色の設定とプロットポイントタイプ設定
my.col <- c("black", "green3")
my.pch <- c(1, 19) # R's symbols for solid and open circles (see ?points)
head(cust.df$email)
## [1] yes no yes yes yes yes
## Levels: no yes
as.numeric(head(cust.df$email))
## [1] 2 1 2 2 2 2
my.col[as.numeric(head(cust.df$email))]
## [1] "green3" "black" "green3" "green3" "green3" "green3"
my.col[head(cust.df$email)]
## [1] "green3" "black" "green3" "green3" "green3" "green3"
#レイアウト設定
layout(mat = matrix(1:2, ncol=2))
par("mar"=c(2, 2, 2, 2), pty="s")
#売り上げ(オンライン,店舗) with email有無
#not bad
plot(cust.df$store.spend, cust.df$online.spend,col=as.numeric(cust.df$email), main="Customers as of June 2014")
#better
plot(cust.df$store.spend, cust.df$online.spend,
cex=0.7,
col=my.col[cust.df$email], pch=my.pch[cust.df$email],
main="Customers as of June 2014",
xlab="Prior 12 months in-store sales ($)",
ylab="Prior 12 months online sales ($)" )
legend(x="topright", legend=paste("email on file:", levels(cust.df$email)),
col=my.col, pch=my.pch)
# Multi-panel plots
par(mfrow=c(2, 2), mar=c(2, 2, 2, 2), par(mfrow=c(1,2), pty="s"))
#or layout(mat = matrix(1:4, ncol=2, nrow=2, byrow=TRUE))
#Difficult to see
plot(cust.df$distance.to.store, cust.df$store.spend, main="store")
plot(cust.df$distance.to.store, cust.df$online.spend, main="online")
#Much better
#!!!spend + 1 to avoid an error due to the fact that log(0) is not defined.
#https://www.quora.com/If-0-e-negative-of-infinity-why-is-log-0-not-equal-to-negative-of-infinity-Why-is-it-undefined
plot(cust.df$distance.to.store, cust.df$store.spend+1, log="xy", main="store, log")
plot(cust.df$distance.to.store, cust.df$online.spend+1, log="xy", main="online, log")
#Log(vector) vs paramter log="xy" (see axis)
#log="xy" is better if you want to keep original axis
par(mfrow=c(1,2))
plot(log(cust.df$distance.to.store), log(cust.df$store.spend+1), main="store, log")
plot(cust.df$distance.to.store, cust.df$store.spend+1, log="xy", main="store, log")
par = old.par
#it is good practice to examine scatterplots between all pairs of variables
pairs(formula = ~ age + credit.score + email +
distance.to.store + online.visits + online.trans +
online.spend + store.trans + store.spend,
data=cust.df)
#same as above
#pairs(cust.df[ , c(2:10)])
#with package car::
#car stands for Companion to Applied Rgression
car::scatterplotMatrix(formula = ~ age + credit.score + email +
distance.to.store + online.visits + online.trans +
online.spend + store.trans + store.spend,
data=cust.df, diagonal="histogram")
## Warning in smoother(x, y, col = col[2], log.x = FALSE, log.y = FALSE,
## spread = spread, : could not fit smooth
## Warning in smoother(x, y, col = col[2], log.x = FALSE, log.y = FALSE,
## spread = spread, : could not fit smooth
## Warning in smoother(x, y, col = col[2], log.x = FALSE, log.y = FALSE,
## spread = spread, : could not fit smooth
## Warning in smoother(x, y, col = col[2], log.x = FALSE, log.y = FALSE,
## spread = spread, : could not fit smooth
## Warning in smoother(x, y, col = col[2], log.x = FALSE, log.y = FALSE,
## spread = spread, : could not fit smooth
## Warning in smoother(x, y, col = col[2], log.x = FALSE, log.y = FALSE,
## spread = spread, : could not fit smooth
## Warning in smoother(x, y, col = col[2], log.x = FALSE, log.y = FALSE,
## spread = spread, : could not fit smooth
## Warning in smoother(x, y, col = col[2], log.x = FALSE, log.y = FALSE,
## spread = spread, : could not fit smooth
#with package gpairs:: more features
##1.including adding smoothed lines on scatterplots
##2.univariate histograms on the diagonal
gpairs::gpairs(cust.df[ , c(2:10)])
#factored email vs email not factored
#!!!why it is useful to set variable types appropriately.
gpairs::gpairs(cbind(as.numeric(cust.df$email), cust.df[,6:8]))#boxplot
gpairs::gpairs(cbind(cust.df$email, cust.df[,6:8]))#scatterplots instead of boxplots by factor
#covariance
cov(cust.df$age, cust.df$credit.score)
## [1] 81.83151
#Correlation
cor(cust.df$age, cust.df$credit.score)
## [1] 0.3129197
lm(cust.df$age ~ cust.df$credit.score)
##
## Call:
## lm(formula = cust.df$age ~ cust.df$credit.score)
##
## Coefficients:
## (Intercept) cust.df$credit.score
## 12.53541 0.03049
#same???
#cov(cust.df$age, cust.df$credit.score)/(sd(cust.df$age)*sd(cust.df$credit.score))
# -1 for perfect negative linear association
# 0 or no linear association,
# 1 for perfect positive linear association
#We often use Cohen’s Rules of Thumb
##r = 0.1 should be considered a small or weak association,
##r = 0.3 might be considered to be medium in strength
##r = 0.5 or higher could be considered to be large or strong
cor.test(cust.df$age, cust.df$credit.score, conf.level = 0.95)
##
## Pearson's product-moment correlation
##
## data: cust.df$age and cust.df$credit.score
## t = 10.408, df = 998, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.2558906 0.3677783
## sample estimates:
## cor
## 0.3129197
# Correlation matrix
cor(cust.df[, c(2, 3, 5:12)])
## age credit.score distance.to.store
## age 1.000000000 0.312919709 -0.02863940
## credit.score 0.312919709 1.000000000 -0.05361713
## distance.to.store -0.028639405 -0.053617131 1.00000000
## online.visits -0.095910511 -0.011404665 0.08625511
## online.trans -0.089246699 -0.003379644 0.07470580
## online.spend -0.088779653 -0.002360410 0.07633296
## store.trans -0.004106361 -0.025210298 -0.24943509
## store.spend -0.006920703 -0.006115533 -0.22446447
## sat.service NA NA NA
## sat.selection NA NA NA
## online.visits online.trans online.spend store.trans
## age -0.09591051 -0.089246699 -0.08877965 -0.004106361
## credit.score -0.01140466 -0.003379644 -0.00236041 -0.025210298
## distance.to.store 0.08625511 0.074705801 0.07633296 -0.249435091
## online.visits 1.00000000 0.989852295 0.98427401 -0.013082898
## online.trans 0.98985229 1.000000000 0.99458198 -0.013386059
## online.spend 0.98427401 0.994581977 1.00000000 -0.011821916
## store.trans -0.01308290 -0.013386059 -0.01182192 1.000000000
## store.spend -0.03490119 -0.034858395 -0.03270388 0.873763479
## sat.service NA NA NA NA
## sat.selection NA NA NA NA
## store.spend sat.service sat.selection
## age -0.006920703 NA NA
## credit.score -0.006115533 NA NA
## distance.to.store -0.224464468 NA NA
## online.visits -0.034901186 NA NA
## online.trans -0.034858395 NA NA
## online.spend -0.032703881 NA NA
## store.trans 0.873763479 NA NA
## store.spend 1.000000000 NA NA
## sat.service NA 1 NA
## sat.selection NA NA 1
#NAを除いて、計算
cor(cust.df[,5:12], use="complete.obs")
## distance.to.store online.visits online.trans
## distance.to.store 1.000000000 0.13443157 0.12146889
## online.visits 0.134431574 1.00000000 0.98932218
## online.trans 0.121468887 0.98932218 1.00000000
## online.spend 0.122908148 0.98285450 0.99391990
## store.trans -0.258032833 -0.02700258 -0.02995585
## store.spend -0.231304529 -0.04001448 -0.04199695
## sat.service 0.040836076 -0.10761237 -0.09916826
## sat.selection -0.007265957 -0.09277910 -0.08963592
## online.spend store.trans store.spend sat.service
## distance.to.store 0.12290815 -0.258032833 -0.23130453 0.040836076
## online.visits 0.98285450 -0.027002580 -0.04001448 -0.107612373
## online.trans 0.99391990 -0.029955850 -0.04199695 -0.099168259
## online.spend 1.00000000 -0.031757000 -0.04333783 -0.098655669
## store.trans -0.03175700 1.000000000 0.86340766 -0.004712993
## store.spend -0.04333783 0.863407657 1.00000000 0.010855703
## sat.service -0.09865567 -0.004712993 0.01085570 1.000000000
## sat.selection -0.08645448 -0.032659901 -0.02474723 0.593774025
## sat.selection
## distance.to.store -0.007265957
## online.visits -0.092779102
## online.trans -0.089635924
## online.spend -0.086454482
## store.trans -0.032659901
## store.spend -0.024747226
## sat.service 0.593774025
## sat.selection 1.000000000
#correlation matrixの視覚化
corrplot::corrplot.mixed(corr=cor(cust.df[ , c(2, 3, 5:12)], use="complete.obs"),
upper="ellipse", tl.pos="lt",
col = gplots::colorpanel(50, "red", "gray60", "blue4"))
#4.5.3 Transforming Variables before Computing Correlations
#Correlationsを求める前に、正規化作業をする。
x <- runif(1000, min=-10, max=10)
#r is near zero despite the fact that there is a perfect nonlinear relationship between x and x2.
#but, we get a correlation close to zero:
cor(x, x^2)
## [1] -0.000229076
plot(x,x^2)
plot(cust.df$distance.to.store, cust.df$store.spend)
cor(1/cust.df$distance.to.store, cust.df$store.spend)
## [1] 0.4176922
plot(1/cust.df$distance.to.store, cust.df$store.spend)
cor(1/sqrt(cust.df$distance.to.store), cust.df$store.spend)
## [1] 0.4934285
# see note above about doing "Clear All" plots in RStudio if plot is odd
#
par(mfrow=c(1,2), pty="s") #pty=s(plot type to be squre)
plot(cust.df$distance.to.store, cust.df$store.spend)
plot(1/sqrt(cust.df$distance.to.store), cust.df$store.spend)
car::powerTransform(cust.df$distance.to.store)
## Estimated transformation parameters
## cust.df$distance.to.store
## 0.0007493118
lambda <- coef(powerTransform(1/cust.df$distance.to.store))
par(mfrow=c(1,2))
hist(cust.df$distance.to.store,
xlab="Distance to Nearest Store", ylab="Count of Customers",
main="Original Distribution")
hist(bcPower(cust.df$distance.to.store, lambda),
xlab="Box-Cox Transform of Distance", ylab="Count of Customers",
main="Transformed Distribution")
#??
car::powerTransform(cust.df$age)
## Estimated transformation parameters
## cust.df$age
## 0.8718129
l.dist <- coef(powerTransform(cust.df$distance.to.store))
l.spend <- coef(powerTransform(cust.df$store.spend+1))
cor(bcPower(cust.df$distance.to.store, l.dist),
bcPower(cust.df$store.spend+1, l.spend))
## [1] -0.4524675
plot(cust.df\(sat.service, cust.df\)sat.selection, xlab=“Customer Satisfaction with Service”, ylab=“Customer Satisfaction with Selection”, main=“Customers as of June 2014”)
plot(jitter(cust.df\(sat.service), jitter(cust.df\)sat.selection), xlab=“Customer Satisfaction with Service”, ylab=“Customer Satisfaction with Selection”, main=“Customers as of June 2014”)
resp <- !is.na(cust.df\(sat.service) cor(cust.df\)sat.service[resp], cust.df$sat.selection[resp])
library(psych) psych::polycho ric(cbind(cust.df\(sat.service[resp],cust.df\)sat.selection[resp]))
This file contains scripts used in Chapter 4 of Chapman & Feit (2015), “R for Marketing Research and Analytics”, Springer. Copyright 2015, Springer Copyright 2015, humogrammer #################################################################