Chapter 4 Relationships Between Continuous Variables

4-0 packagesインストール& graphic設定

#install packages
temp.packages <- c("car","gpairs","corrplot","psych","knitr","gplots")
temp.packages <- temp.packages[!(temp.packages %in% installed.packages()[,"Package"])]
if(length(temp.packages)>0) install.packages(temp.packages)
remove("temp.packages")

#graphic 設定
old.par = par(mfrow=c(1,1), mar=c(3, 3, 3, 3))
par(old.par)

4-1 Retailer Data

#e (mathematical constant) 2.71828 
1+sum(1/factorial(1:50))

## [1] 2.718282

log(2)

## [1] 0.6931472

exp(0.6931472)

## [1] 2

#got it?

###お客1000人のデータを作ります。
#my postal code Sumida-ku Osiage.
set.seed(1310045)

#the number of customers.s
ncust <- 1000 

#customer unique id
cust.df <- data.frame(cust.id=factor(1:ncust)) 

#cutomer age
cust.df$age <- rnorm(n=ncust, mean=35, sd=5)

#信用格付
cust.df$credit.score <- rnorm(n=ncust, mean=3*cust.df$age+620, sd=50) 

#メール有無は8:2に設定
cust.df$email <- factor(sample(c("yes", "no"), size=ncust, replace=TRUE, prob=c(0.8, 0.2)))

#同じ処理 = rlnorm(n=ncust, meanlog = 2 , sdlog = 1.2)
cust.df$distance.to.store <- exp(rnorm(n=ncust, mean=2, sd=1.2)) #sotreまでの距離

#distance.to.store(対数正規分布)
hist(cust.df$distance.to.store, main = "lognormal distribution", breaks = 30)

####オンライン訪問
cust.df$online.visits <- rnbinom(ncust, size=0.3, 
                          mu = 15 + ifelse(cust.df$email=="yes", 15, 0) - 0.7 * (cust.df$age-median(cust.df$age))) 
#An alternative parametrization (often used in ecology) is by the mean mu (see above), and size, the dispersion parameter, where prob = size/(size+mu). The variance is mu + mu^2/size in this parametrization.

#オンライントランザクション数
cust.df$online.trans <- rbinom(ncust, size=cust.df$online.visits, prob=0.3)

#オンライン売り上げ
cust.df$online.spend <- exp(rnorm(ncust, mean=3, sd=0.1)) * cust.df$online.trans

#店舗トランザクション数
cust.df$store.trans <- rnbinom(ncust, size=5, mu=3 / sqrt(cust.df$distance.to.store))
#店舗トランザクション売り上げ
cust.df$store.spend <- exp(rnorm(ncust, mean=3.5, sd=0.4)) * cust.df$store.trans
#summary(cust.df) or psych::describe(cust.df)

#満足度全て
sat.overall <- rnorm(ncust, mean=3.1, sd=0.7)

#サービス満足度調査結果(サービス&商品選定)
sat.service <- floor(sat.overall + rnorm(ncust, mean=0.5, sd=0.4))
sat.selection <- floor(sat.overall + rnorm(ncust, mean=-0.2, sd=0.6))
summary(cbind(sat.service, sat.selection))

##   sat.service    sat.selection  
##  Min.   :0.000   Min.   :-1.00  
##  1st Qu.:3.000   1st Qu.: 2.00  
##  Median :3.000   Median : 2.00  
##  Mean   :3.084   Mean   : 2.39  
##  3rd Qu.:4.000   3rd Qu.: 3.00  
##  Max.   :5.000   Max.   : 5.00

#[−1 to 6] -> [1 to 5]  
#満足度は1から5
sat.service[sat.service > 5] <- 5
sat.service[sat.service < 1] <- 1
sat.selection[sat.selection > 5] <- 5
sat.selection[sat.selection < 1] <- 1
#summary(cbind(sat.service, sat.selection), digit=1)


#4.1.4 Simulating Non-Response Data
no.response <- as.logical(rbinom(ncust, size=1, prob=cust.df$age/100))

##NAは統計から除去してsummmary
sat.service[no.response] <- NA
sat.selection[no.response] <- NA

summary(cbind(sat.service, sat.selection))

##   sat.service    sat.selection  
##  Min.   :1.000   Min.   :1.000  
##  1st Qu.:3.000   1st Qu.:2.000  
##  Median :3.000   Median :2.000  
##  Mean   :3.118   Mean   :2.464  
##  3rd Qu.:4.000   3rd Qu.:3.000  
##  Max.   :5.000   Max.   :5.000  
##  NA's   :339     NA's   :339

cust.df$sat.service <- sat.service
cust.df$sat.selection <- sat.selection
summary(cust.df)

##     cust.id         age         credit.score   email    
##  1      :  1   Min.   :19.38   Min.   :532.6   no :195  
##  2      :  1   1st Qu.:31.11   1st Qu.:687.5   yes:805  
##  3      :  1   Median :34.48   Median :726.9            
##  4      :  1   Mean   :34.59   Mean   :723.3            
##  5      :  1   3rd Qu.:37.97   3rd Qu.:756.9            
##  6      :  1   Max.   :51.63   Max.   :869.6            
##  (Other):994                                            
##  distance.to.store   online.visits     online.trans      online.spend    
##  Min.   :  0.09843   Min.   :  0.00   Min.   :  0.000   Min.   :   0.00  
##  1st Qu.:  3.39106   1st Qu.:  0.00   1st Qu.:  0.000   1st Qu.:   0.00  
##  Median :  7.77187   Median :  6.00   Median :  2.000   Median :  38.92  
##  Mean   : 15.51549   Mean   : 28.92   Mean   :  8.716   Mean   : 177.72  
##  3rd Qu.: 16.44356   3rd Qu.: 33.00   3rd Qu.: 10.000   3rd Qu.: 194.50  
##  Max.   :287.90872   Max.   :402.00   Max.   :133.000   Max.   :2960.72  
##                                                                          
##   store.trans      store.spend      sat.service    sat.selection  
##  Min.   : 0.000   Min.   :  0.00   Min.   :1.000   Min.   :1.000  
##  1st Qu.: 0.000   1st Qu.:  0.00   1st Qu.:3.000   1st Qu.:2.000  
##  Median : 1.000   Median : 30.07   Median :3.000   Median :2.000  
##  Mean   : 1.339   Mean   : 47.74   Mean   :3.118   Mean   :2.464  
##  3rd Qu.: 2.000   3rd Qu.: 63.34   3rd Qu.:4.000   3rd Qu.:3.000  
##  Max.   :16.000   Max.   :521.35   Max.   :5.000   Max.   :5.000  
##                                    NA's   :339     NA's   :339

#temp variablesを除去
rm(ncust, sat.overall, sat.service, sat.selection, no.response)

#お客データ構造確認
str(cust.df)

## 'data.frame':    1000 obs. of  12 variables:
##  $ cust.id          : Factor w/ 1000 levels "1","2","3","4",..: 1 2 3 4 5 6 7 8 9 10 ...
##  $ age              : num  37.6 36.7 42.3 38.4 42.2 ...
##  $ credit.score     : num  633 734 751 725 745 ...
##  $ email            : Factor w/ 2 levels "no","yes": 2 1 2 2 2 2 1 1 2 2 ...
##  $ distance.to.store: num  11.39 10.98 1.48 1.44 39.09 ...
##  $ online.visits    : num  29 0 14 41 31 26 7 12 55 1 ...
##  $ online.trans     : int  12 0 5 18 11 7 1 4 20 0 ...
##  $ online.spend     : num  229.3 0 97.5 333.7 241.7 ...
##  $ store.trans      : num  2 1 0 3 0 3 1 2 0 0 ...
##  $ store.spend      : num  79.6 33.8 0 117.7 0 ...
##  $ sat.service      : num  NA NA 1 4 4 2 5 NA 4 3 ...
##  $ sat.selection    : num  NA NA 1 4 1 2 3 NA 4 3 ...

4-2 Exploring Associations Between Variables with Scatterplots

#散布図(Scatter plot)
#年と信用格付の関係
plot(cust.df$age, cust.df$credit.score, 
     col="blue",
     xlim=c(15, 55), ylim=c(500, 900), 
     main="Active Customers as of June 2014",
     xlab="Customer Age (years)", ylab="Customer Credit Score ",
     panel.first = grid(3, lty = 1, lwd = 2)
     )
abline(h=mean(cust.df$credit.score), col="dark blue", lty="dotted", lw=3)
abline(v=mean(cust.df$age), col="dark blue", lty="dotted", lw=3)

# １年間店舗とオンライン売り上げ比較
# !!! buy more online buy less in stores?
plot(cust.df$store.spend, cust.df$online.spend, 
     main="Customers as of June 2014", 
     xlab="Prior 12 months in-store sales ($)", 
     ylab="Prior 12 months online sales ($)", 
     cex=0.7)

#plotを呼ぶとき、実はデータタイプにより違うメッソードが呼ばれる。
methods(plot)[1:20]

##  [1] "plot,allcategorical_missing_data.frame,binary-method"     
##  [2] "plot,allcategorical_missing_data.frame,categorical-method"
##  [3] "plot,ANY,ANY-method"                                      
##  [4] "plot,color,ANY-method"                                    
##  [5] "plot,mi_list,ANY-method"                                  
##  [6] "plot,mi,ANY-method"                                       
##  [7] "plot,missing_data.frame,binary-method"                    
##  [8] "plot,missing_data.frame,categorical-method"               
##  [9] "plot,missing_data.frame,missing_variable-method"          
## [10] "plot,missing_data.frame,semi-continuous-method"           
## [11] "plot,profile.mle,missing-method"                          
## [12] "plot,spam,missing-method"                                 
## [13] "plot,spam,spam-method"                                    
## [14] "plot,Spatial,missing-method"                              
## [15] "plot,SpatialGrid,missing-method"                          
## [16] "plot,SpatialLines,missing-method"                         
## [17] "plot,SpatialMultiPoints,missing-method"                   
## [18] "plot,SpatialPixels,missing-method"                        
## [19] "plot,SpatialPoints,missing-method"                        
## [20] "plot,SpatialPolygons,missing-method"

#店舗売り上げ
hist(cust.df$store.spend, 
     breaks=(0:ceiling(max(cust.df$store.spend)/10))*10,
     main="Customers as of June 2014", 
     xlab="Prior 12 months online sales ($)", 
     ylab="Count of customers",
     col = rainbow(30)
)

#色の設定とプロットポイントタイプ設定
my.col <- c("black", "green3")
my.pch <- c(1, 19) # R's symbols for solid and open circles (see ?points)
head(cust.df$email)

## [1] yes no  yes yes yes yes
## Levels: no yes

as.numeric(head(cust.df$email))

## [1] 2 1 2 2 2 2

my.col[as.numeric(head(cust.df$email))]

## [1] "green3" "black"  "green3" "green3" "green3" "green3"

my.col[head(cust.df$email)]

## [1] "green3" "black"  "green3" "green3" "green3" "green3"

#レイアウト設定
layout(mat = matrix(1:2, ncol=2))
par("mar"=c(2, 2, 2, 2),  pty="s")

#売り上げ(オンライン,店舗) with email有無

#not bad
plot(cust.df$store.spend, cust.df$online.spend,col=as.numeric(cust.df$email), main="Customers as of June 2014")
#better
plot(cust.df$store.spend, cust.df$online.spend,
     cex=0.7,
     col=my.col[cust.df$email], pch=my.pch[cust.df$email], 
     main="Customers as of June 2014", 
     xlab="Prior 12 months in-store sales ($)", 
     ylab="Prior 12 months online sales ($)" )
legend(x="topright", legend=paste("email on file:", levels(cust.df$email)), 
       col=my.col, pch=my.pch)

4.3 Combining Plots in a Single Graphics Object

# Multi-panel plots
par(mfrow=c(2, 2), mar=c(2, 2, 2, 2), par(mfrow=c(1,2), pty="s")) 
#or layout(mat = matrix(1:4, ncol=2, nrow=2, byrow=TRUE))

#Difficult to see
plot(cust.df$distance.to.store, cust.df$store.spend, main="store")
plot(cust.df$distance.to.store, cust.df$online.spend, main="online")

#Much better
#!!!spend + 1 to avoid an error due to the fact that log(0) is not defined.
#https://www.quora.com/If-0-e-negative-of-infinity-why-is-log-0-not-equal-to-negative-of-infinity-Why-is-it-undefined
plot(cust.df$distance.to.store, cust.df$store.spend+1, log="xy", main="store, log")
plot(cust.df$distance.to.store, cust.df$online.spend+1, log="xy", main="online, log")

#Log(vector) vs paramter log="xy"  (see axis) 
#log="xy" is better if you want to keep original axis
par(mfrow=c(1,2))
plot(log(cust.df$distance.to.store), log(cust.df$store.spend+1), main="store, log")
plot(cust.df$distance.to.store, cust.df$store.spend+1, log="xy", main="store, log")

par = old.par

4.4 Scatterplot Matrices

#it is good practice to examine scatterplots between all pairs of variables
pairs(formula = ~ age + credit.score + email +
                  distance.to.store + online.visits + online.trans + 
                  online.spend + store.trans + store.spend,
                  data=cust.df)

#same as above
#pairs(cust.df[ , c(2:10)])

#with package car::
#car stands for Companion to Applied Rgression
car::scatterplotMatrix(formula = ~ age + credit.score + email +
                    distance.to.store + online.visits + online.trans + 
                    online.spend + store.trans + store.spend, 
                  data=cust.df, diagonal="histogram")

## Warning in smoother(x, y, col = col[2], log.x = FALSE, log.y = FALSE,
## spread = spread, : could not fit smooth

## Warning in smoother(x, y, col = col[2], log.x = FALSE, log.y = FALSE,
## spread = spread, : could not fit smooth

## Warning in smoother(x, y, col = col[2], log.x = FALSE, log.y = FALSE,
## spread = spread, : could not fit smooth

## Warning in smoother(x, y, col = col[2], log.x = FALSE, log.y = FALSE,
## spread = spread, : could not fit smooth

## Warning in smoother(x, y, col = col[2], log.x = FALSE, log.y = FALSE,
## spread = spread, : could not fit smooth

## Warning in smoother(x, y, col = col[2], log.x = FALSE, log.y = FALSE,
## spread = spread, : could not fit smooth

## Warning in smoother(x, y, col = col[2], log.x = FALSE, log.y = FALSE,
## spread = spread, : could not fit smooth

## Warning in smoother(x, y, col = col[2], log.x = FALSE, log.y = FALSE,
## spread = spread, : could not fit smooth

#with package gpairs:: more features
##1.including adding smoothed lines on scatterplots
##2.univariate histograms on the diagonal
gpairs::gpairs(cust.df[ , c(2:10)])

#factored email vs email not factored
#!!!why it is useful to set variable types appropriately.
gpairs::gpairs(cbind(as.numeric(cust.df$email), cust.df[,6:8]))#boxplot

gpairs::gpairs(cbind(cust.df$email, cust.df[,6:8]))#scatterplots instead of boxplots by factor

4.5 Correlation Coefficients

正規分布されたことを前提で正しい係数が出る

a log- arithmic or other distribution that is skewed or strongly non-normal in shape, then these thresholds do not apply. In those cases, it can be helpful to transform your variables to normal distributions before interpreting,

#covariance
cov(cust.df$age, cust.df$credit.score)

## [1] 81.83151

#Correlation
cor(cust.df$age, cust.df$credit.score)

## [1] 0.3129197

lm(cust.df$age ~ cust.df$credit.score)

## 
## Call:
## lm(formula = cust.df$age ~ cust.df$credit.score)
## 
## Coefficients:
##          (Intercept)  cust.df$credit.score  
##             12.53541               0.03049

#same???
#cov(cust.df$age, cust.df$credit.score)/(sd(cust.df$age)*sd(cust.df$credit.score)) 

# -1 for perfect negative linear association
#  0 or no linear association,
#  1 for perfect positive linear association
#We often use Cohen’s Rules of Thumb
##r = 0.1 should be considered a small or weak association, 
##r = 0.3 might be considered to be medium in strength
##r = 0.5 or higher could be considered to be large or strong
cor.test(cust.df$age, cust.df$credit.score, conf.level = 0.95)

## 
##  Pearson's product-moment correlation
## 
## data:  cust.df$age and cust.df$credit.score
## t = 10.408, df = 998, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.2558906 0.3677783
## sample estimates:
##       cor 
## 0.3129197

# Correlation matrix 
cor(cust.df[, c(2, 3, 5:12)])

##                            age credit.score distance.to.store
## age                1.000000000  0.312919709       -0.02863940
## credit.score       0.312919709  1.000000000       -0.05361713
## distance.to.store -0.028639405 -0.053617131        1.00000000
## online.visits     -0.095910511 -0.011404665        0.08625511
## online.trans      -0.089246699 -0.003379644        0.07470580
## online.spend      -0.088779653 -0.002360410        0.07633296
## store.trans       -0.004106361 -0.025210298       -0.24943509
## store.spend       -0.006920703 -0.006115533       -0.22446447
## sat.service                 NA           NA                NA
## sat.selection               NA           NA                NA
##                   online.visits online.trans online.spend  store.trans
## age                 -0.09591051 -0.089246699  -0.08877965 -0.004106361
## credit.score        -0.01140466 -0.003379644  -0.00236041 -0.025210298
## distance.to.store    0.08625511  0.074705801   0.07633296 -0.249435091
## online.visits        1.00000000  0.989852295   0.98427401 -0.013082898
## online.trans         0.98985229  1.000000000   0.99458198 -0.013386059
## online.spend         0.98427401  0.994581977   1.00000000 -0.011821916
## store.trans         -0.01308290 -0.013386059  -0.01182192  1.000000000
## store.spend         -0.03490119 -0.034858395  -0.03270388  0.873763479
## sat.service                  NA           NA           NA           NA
## sat.selection                NA           NA           NA           NA
##                    store.spend sat.service sat.selection
## age               -0.006920703          NA            NA
## credit.score      -0.006115533          NA            NA
## distance.to.store -0.224464468          NA            NA
## online.visits     -0.034901186          NA            NA
## online.trans      -0.034858395          NA            NA
## online.spend      -0.032703881          NA            NA
## store.trans        0.873763479          NA            NA
## store.spend        1.000000000          NA            NA
## sat.service                 NA           1            NA
## sat.selection               NA          NA             1

#NAを除いて、計算
cor(cust.df[,5:12], use="complete.obs")

##                   distance.to.store online.visits online.trans
## distance.to.store       1.000000000    0.13443157   0.12146889
## online.visits           0.134431574    1.00000000   0.98932218
## online.trans            0.121468887    0.98932218   1.00000000
## online.spend            0.122908148    0.98285450   0.99391990
## store.trans            -0.258032833   -0.02700258  -0.02995585
## store.spend            -0.231304529   -0.04001448  -0.04199695
## sat.service             0.040836076   -0.10761237  -0.09916826
## sat.selection          -0.007265957   -0.09277910  -0.08963592
##                   online.spend  store.trans store.spend  sat.service
## distance.to.store   0.12290815 -0.258032833 -0.23130453  0.040836076
## online.visits       0.98285450 -0.027002580 -0.04001448 -0.107612373
## online.trans        0.99391990 -0.029955850 -0.04199695 -0.099168259
## online.spend        1.00000000 -0.031757000 -0.04333783 -0.098655669
## store.trans        -0.03175700  1.000000000  0.86340766 -0.004712993
## store.spend        -0.04333783  0.863407657  1.00000000  0.010855703
## sat.service        -0.09865567 -0.004712993  0.01085570  1.000000000
## sat.selection      -0.08645448 -0.032659901 -0.02474723  0.593774025
##                   sat.selection
## distance.to.store  -0.007265957
## online.visits      -0.092779102
## online.trans       -0.089635924
## online.spend       -0.086454482
## store.trans        -0.032659901
## store.spend        -0.024747226
## sat.service         0.593774025
## sat.selection       1.000000000

#correlation matrixの視覚化
corrplot::corrplot.mixed(corr=cor(cust.df[ , c(2, 3, 5:12)], use="complete.obs"), 
               upper="ellipse", tl.pos="lt", 
               col = gplots::colorpanel(50, "red", "gray60", "blue4"))

#4.5.3 Transforming Variables before Computing Correlations
#Correlationsを求める前に、正規化作業をする。
x <- runif(1000, min=-10, max=10)
#r is near zero despite the fact that there is a perfect nonlinear relationship between x and x2. 
#but, we get a correlation close to zero:
cor(x, x^2)

## [1] -0.000229076

plot(x,x^2)

plot(cust.df$distance.to.store, cust.df$store.spend)

cor(1/cust.df$distance.to.store, cust.df$store.spend)

## [1] 0.4176922

plot(1/cust.df$distance.to.store, cust.df$store.spend)

cor(1/sqrt(cust.df$distance.to.store), cust.df$store.spend)

## [1] 0.4934285

# see note above about doing "Clear All" plots in RStudio if plot is odd
#
par(mfrow=c(1,2), pty="s") #pty=s(plot type to be squre)
plot(cust.df$distance.to.store, cust.df$store.spend)
plot(1/sqrt(cust.df$distance.to.store), cust.df$store.spend)

car::powerTransform(cust.df$distance.to.store)

## Estimated transformation parameters 
## cust.df$distance.to.store 
##              0.0007493118

lambda <- coef(powerTransform(1/cust.df$distance.to.store))

par(mfrow=c(1,2))
hist(cust.df$distance.to.store, 
     xlab="Distance to Nearest Store", ylab="Count of Customers", 
     main="Original Distribution")
hist(bcPower(cust.df$distance.to.store, lambda),
     xlab="Box-Cox Transform of Distance", ylab="Count of Customers", 
     main="Transformed Distribution")

#??
car::powerTransform(cust.df$age)

## Estimated transformation parameters 
## cust.df$age 
##   0.8718129

l.dist  <- coef(powerTransform(cust.df$distance.to.store))
l.spend <- coef(powerTransform(cust.df$store.spend+1))

cor(bcPower(cust.df$distance.to.store, l.dist), 
    bcPower(cust.df$store.spend+1, l.spend))

## [1] -0.4524675

Chapter 4 Relationships Between Continuous Variables

4-0 packagesインストール& graphic設定

4-1 Retailer Data

4-2 Exploring Associations Between Variables with Scatterplots

4.3 Combining Plots in a Single Graphics Object

4.4 Scatterplot Matrices

4.5 Correlation Coefficients

正規分布されたことを前提で正しい係数が出る

a log- arithmic or other distribution that is skewed or strongly non-normal in shape, then these thresholds do not apply. In those cases, it can be helpful to transform your variables to normal distributions before interpreting,

Polychoric correlations