Chapter 3 Describing Data

データを作る
Discrete Variables
Continuous Variables
apply()
single variable visualization
world map
Reference

データを作る

k.stores <- 20    # 20 stores
k.weeks <- 104    # 2 years of data each

store.df <- data.frame(matrix(NA, ncol=10, nrow=k.stores*k.weeks))
names(store.df) <- c("storeNum", "Year", "Week", "p1sales", "p2sales", 
                     "p1price", "p2price", "p1prom", "p2prom", "country")
##次元(dimension)確認
dim(store.df)

## [1] 2080   10

#ストア番号は101から、　120まで
store.num <- 101:(100+k.stores)

#国設定
(store.cty <- c(rep("US", 3), rep("DE", 5), rep("GB", 3), rep("BR", 2),
                rep("JP", 4), rep("AU", 1), rep("CN", 2)))

##  [1] "US" "US" "US" "DE" "DE" "DE" "DE" "DE" "GB" "GB" "GB" "BR" "BR" "JP"
## [15] "JP" "JP" "JP" "AU" "CN" "CN"

length(store.cty)    # make sure the country list is the right length

## [1] 20

store.df$storeNum <- rep(store.num, each=k.weeks)
store.df$country  <- rep(store.cty, each=k.weeks)

# clean up、使った変数はコミ箱に!!
rm(store.num, store.cty)

store.df$Week <- rep(1:52, times=k.stores*2)

# try the inner parts of the next line to figure out how we use rep()
test <- (store.df$Year  <- rep(rep(1:2, each=k.weeks/2), times=k.stores))
str(store.df)

## 'data.frame':    2080 obs. of  10 variables:
##  $ storeNum: int  101 101 101 101 101 101 101 101 101 101 ...
##  $ Year    : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ Week    : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ p1sales : logi  NA NA NA NA NA NA ...
##  $ p2sales : logi  NA NA NA NA NA NA ...
##  $ p1price : logi  NA NA NA NA NA NA ...
##  $ p2price : logi  NA NA NA NA NA NA ...
##  $ p1prom  : logi  NA NA NA NA NA NA ...
##  $ p2prom  : logi  NA NA NA NA NA NA ...
##  $ country : chr  "US" "US" "US" "US" ...

#facotr化する
store.df$storeNum <- factor(store.df$storeNum)
store.df$country  <- factor(store.df$country)

#内容確認
str(store.df)

## 'data.frame':    2080 obs. of  10 variables:
##  $ storeNum: Factor w/ 20 levels "101","102","103",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ Year    : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ Week    : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ p1sales : logi  NA NA NA NA NA NA ...
##  $ p2sales : logi  NA NA NA NA NA NA ...
##  $ p1price : logi  NA NA NA NA NA NA ...
##  $ p2price : logi  NA NA NA NA NA NA ...
##  $ p1prom  : logi  NA NA NA NA NA NA ...
##  $ p2prom  : logi  NA NA NA NA NA NA ...
##  $ country : Factor w/ 7 levels "AU","BR","CN",..: 7 7 7 7 7 7 7 7 7 7 ...

#if you want to check data use head or tail 
#head(store.df)   # defaults to 6 rows
#head(store.df, 120)  # 120 rows is enough to check 2 stores
#tail(store.df, 120)  # make sure end looks OK too


# set random seed to make the random sequences replicable
#今わ説明しません。好きな番号でいいです。
set.seed(98250) 

# promotion status, using binomial distribution, rbinom()
store.df$p1prom <- rbinom(n=nrow(store.df), size=1, p=0.1)  # 10% promoted
mean(store.df$p1prom) #check mean is 0.1

## [1] 0.1

store.df$p2prom <- rbinom(n=nrow(store.df), size=1, p=0.15) # 15%
mean(store.df$p2prom) #check mean is 0.15

## [1] 0.1384615

#head(store.df)

# prices
store.df$p1price <- sample(x=c(2.19, 2.29, 2.49, 2.79, 2.99), 
                           size=nrow(store.df), replace=TRUE)
unique(store.df$p1price) # check what sample did

## [1] 2.29 2.49 2.99 2.79 2.19

mean(store.df$p1price)

## [1] 2.544375

store.df$p2price <- sample(x=c(2.29, 2.49, 2.59, 2.99, 3.19), 
                           size=nrow(store.df), replace=TRUE)
head(store.df)

##   storeNum Year Week p1sales p2sales p1price p2price p1prom p2prom country
## 1      101    1    1      NA      NA    2.29    2.29      0      0      US
## 2      101    1    2      NA      NA    2.49    2.49      0      0      US
## 3      101    1    3      NA      NA    2.99    2.99      1      0      US
## 4      101    1    4      NA      NA    2.99    3.19      0      0      US
## 5      101    1    5      NA      NA    2.49    2.59      0      1      US
## 6      101    1    6      NA      NA    2.79    2.49      0      0      US

# sales data, using poisson (counts) distribution, rpois()
# first, the default sales in the absence of promotion
tmp.sales1 <- rpois(nrow(store.df), lambda=120)  # lambda = mean sales per week
tmp.sales2 <- rpois(nrow(store.df), lambda=100)  # lambda = mean sales per week

# scale sales according to the ratio of log(price)
tmp.sales1 <- tmp.sales1 * log(store.df$p2price) / log(store.df$p1price)
tmp.sales2 <- tmp.sales2 * log(store.df$p1price) / log(store.df$p2price)

# final sales get a 30% or 40% lift when promoted
store.df$p1sales <- floor(tmp.sales1 * (1 + store.df$p1prom*0.3))
store.df$p2sales <- floor(tmp.sales2 * (1 + store.df$p2prom*0.4))
#head(store.df)

# install.packages("car")  #only run once
library(car)
some(store.df, 3)

##      storeNum Year Week p1sales p2sales p1price p2price p1prom p2prom
## 27        101    1   27     135      99    2.29    2.49      0      0
## 144       102    1   40     123     113    2.79    2.59      0      0
## 1885      119    1   13     133     100    2.29    2.59      0      0
##      country
## 27        US
## 144       US
## 1885      CN

# command to write it out to a CSV:
# write.csv(store.df, file="rintro-chapter3.csv",row.names=FALSE)

############

Discrete Variables

table(store.df$p1price)

## 
## 2.19 2.29 2.49 2.79 2.99 
##  395  444  423  443  375

p1.table <- table(store.df$p1price)
p1.table

## 
## 2.19 2.29 2.49 2.79 2.99 
##  395  444  423  443  375

str(p1.table)

##  'table' int [1:5(1d)] 395 444 423 443 375
##  - attr(*, "dimnames")=List of 1
##   ..$ : chr [1:5] "2.19" "2.29" "2.49" "2.79" ...

plot(p1.table)

table(store.df$p1price, store.df$p1prom)

##       
##          0   1
##   2.19 354  41
##   2.29 398  46
##   2.49 381  42
##   2.79 396  47
##   2.99 343  32

p1.table2 <- table(store.df$p1price, store.df$p1prom)

#1がプロモションありです。プロモションした時の
p1.table2[, 2] / (p1.table2[, 1] + p1.table2[, 2])

##       2.19       2.29       2.49       2.79       2.99 
## 0.10379747 0.10360360 0.09929078 0.10609481 0.08533333

Continuous Variables

#For skewed and asymmetric distributions that are common in marketing, such as unit sales or household income, the arithmetic mean() and standard deviation sd() may be misleading; in those cases, the median() and interquartile range (IQR(), the range of the middle 50 % of data) are often more useful to summarize a distribution.

min(store.df$p1sales)

## [1] 73

max(store.df$p2sales)

## [1] 225

mean(store.df$p1prom)

## [1] 0.1

median(store.df$p2sales)

## [1] 96

var(store.df$p1sales)

## [1] 805.0044

sd(store.df$p1sales)

## [1] 28.3726

#Q3 - Q1
#Interquartile range, 75th–25th per- centile
IQR(store.df$p1sales) #default = type = 7  na.rm = FALSE

## [1] 37

#Q3とQ1をboxplotを使って確認してみましょう。
q3q1 <- boxplot(store.df$p1sales)$stats[c(4,2),]

q3q1[1]-q3q1[2]

## [1] 37

#中央絶対偏差Median Absolute Deviation
x <- store.df$p1sales
mad(x)  #qnorm(0.75) ≈ 1/1.4826 = zscore(0.67)

## [1] 26.6868

#First, determine how many standard deviations above the mean one would have to be to be in the 75th percentile. This can be found by using a z table and finding the z associated with 0.75. The value of z is 0.674. Thus, one must be .674 standard deviations above the mean to be in the 75th percentile.
1.4826*median(abs(x-median(x)))

## [1] 26.6868

#The MAD is the median of the absolute values of the differences between the data values and the overall median of the data set; for a Gaussian distribution, MAD is related to σ as \sigma \approx 1.4826\ \operatorname{MAD}. \,  (The derivation can be found here.)
1.4826*median(abs(x-median(x))) == mad(x)

## [1] TRUE

x.quantile <- quantile(store.df$p1sales, probs=c(0.25,0.50,0.75))   # interquartile range
median(store.df$p1sales)== x.quantile["50%"]

##  50% 
## TRUE

quantile(store.df$p1sales, probs=c(0.05, 0.95))  # central 90%

##  5% 95% 
##  93 184

quantile(store.df$p1sales, probs=0:10/10)

##    0%   10%   20%   30%   40%   50%   60%   70%   80%   90%  100% 
##  73.0 100.0 109.0 117.0 122.6 129.0 136.0 145.0 156.0 171.0 263.0

## Build a summary
mysummary.df <- data.frame(matrix(NA, nrow=2, ncol=2))
names(mysummary.df) <- c("Median Sales", "IQR")
rownames(mysummary.df) <- c("Product 1", "Product 2")
mysummary.df["Product 1", "Median Sales"] <- median(store.df$p1sales)
mysummary.df["Product 2", "Median Sales"] <- median(store.df$p2sales)
mysummary.df["Product 1", "IQR"] <- IQR(store.df$p1sales)
mysummary.df["Product 2", "IQR"] <- IQR(store.df$p2sales)
mysummary.df

##           Median Sales IQR
## Product 1          129  37
## Product 2           96  29

## Descriptive summaries
summary(store.df)

##     storeNum         Year          Week          p1sales   
##  101    : 104   Min.   :1.0   Min.   : 1.00   Min.   : 73  
##  102    : 104   1st Qu.:1.0   1st Qu.:13.75   1st Qu.:113  
##  103    : 104   Median :1.5   Median :26.50   Median :129  
##  104    : 104   Mean   :1.5   Mean   :26.50   Mean   :133  
##  105    : 104   3rd Qu.:2.0   3rd Qu.:39.25   3rd Qu.:150  
##  106    : 104   Max.   :2.0   Max.   :52.00   Max.   :263  
##  (Other):1456                                              
##     p2sales         p1price         p2price         p1prom   
##  Min.   : 51.0   Min.   :2.190   Min.   :2.29   Min.   :0.0  
##  1st Qu.: 84.0   1st Qu.:2.290   1st Qu.:2.49   1st Qu.:0.0  
##  Median : 96.0   Median :2.490   Median :2.59   Median :0.0  
##  Mean   :100.2   Mean   :2.544   Mean   :2.70   Mean   :0.1  
##  3rd Qu.:113.0   3rd Qu.:2.790   3rd Qu.:2.99   3rd Qu.:0.0  
##  Max.   :225.0   Max.   :2.990   Max.   :3.19   Max.   :1.0  
##                                                              
##      p2prom       country 
##  Min.   :0.0000   AU:104  
##  1st Qu.:0.0000   BR:208  
##  Median :0.0000   CN:208  
##  Mean   :0.1385   DE:520  
##  3rd Qu.:0.0000   GB:312  
##  Max.   :1.0000   JP:416  
##                   US:312

summary(store.df$Year)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     1.0     1.0     1.5     1.5     2.0     2.0

#The digits= argument is helpful if you wish to change the precision of the display:
summary(store.df, digits=2)

##     storeNum         Year          Week       p1sales       p2sales   
##  101    : 104   Min.   :1.0   Min.   : 1   Min.   : 73   Min.   : 51  
##  102    : 104   1st Qu.:1.0   1st Qu.:14   1st Qu.:113   1st Qu.: 84  
##  103    : 104   Median :1.5   Median :26   Median :129   Median : 96  
##  104    : 104   Mean   :1.5   Mean   :26   Mean   :133   Mean   :100  
##  105    : 104   3rd Qu.:2.0   3rd Qu.:39   3rd Qu.:150   3rd Qu.:113  
##  106    : 104   Max.   :2.0   Max.   :52   Max.   :263   Max.   :225  
##  (Other):1456                                                         
##     p1price       p2price        p1prom        p2prom     country 
##  Min.   :2.2   Min.   :2.3   Min.   :0.0   Min.   :0.00   AU:104  
##  1st Qu.:2.3   1st Qu.:2.5   1st Qu.:0.0   1st Qu.:0.00   BR:208  
##  Median :2.5   Median :2.6   Median :0.0   Median :0.00   CN:208  
##  Mean   :2.5   Mean   :2.7   Mean   :0.1   Mean   :0.14   DE:520  
##  3rd Qu.:2.8   3rd Qu.:3.0   3rd Qu.:0.0   3rd Qu.:0.00   GB:312  
##  Max.   :3.0   Max.   :3.2   Max.   :1.0   Max.   :1.00   JP:416  
##                                                           US:312

# install.packages("psych")  #only run once
library(psych)
describe(store.df)

##           vars    n   mean    sd median trimmed   mad   min    max range
## storeNum*    1 2080  10.50  5.77  10.50   10.50  7.41  1.00  20.00  19.0
## Year         2 2080   1.50  0.50   1.50    1.50  0.74  1.00   2.00   1.0
## Week         3 2080  26.50 15.01  26.50   26.50 19.27  1.00  52.00  51.0
## p1sales      4 2080 133.05 28.37 129.00  131.08 26.69 73.00 263.00 190.0
## p2sales      5 2080 100.16 24.42  96.00   98.05 22.24 51.00 225.00 174.0
## p1price      6 2080   2.54  0.29   2.49    2.53  0.44  2.19   2.99   0.8
## p2price      7 2080   2.70  0.33   2.59    2.69  0.44  2.29   3.19   0.9
## p1prom       8 2080   0.10  0.30   0.00    0.00  0.00  0.00   1.00   1.0
## p2prom       9 2080   0.14  0.35   0.00    0.05  0.00  0.00   1.00   1.0
## country*    10 2080   4.55  1.72   4.50    4.62  2.22  1.00   7.00   6.0
##            skew kurtosis   se
## storeNum*  0.00    -1.21 0.13
## Year       0.00    -2.00 0.01
## Week       0.00    -1.20 0.33
## p1sales    0.74     0.66 0.62
## p2sales    0.99     1.51 0.54
## p1price    0.28    -1.44 0.01
## p2price    0.32    -1.40 0.01
## p1prom     2.66     5.10 0.01
## p2prom     2.09     2.38 0.01
## country*  -0.29    -0.81 0.04

describe(store.df[ , c(2, 4:9)])

##         vars    n   mean    sd median trimmed   mad   min    max range
## Year       1 2080   1.50  0.50   1.50    1.50  0.74  1.00   2.00   1.0
## p1sales    2 2080 133.05 28.37 129.00  131.08 26.69 73.00 263.00 190.0
## p2sales    3 2080 100.16 24.42  96.00   98.05 22.24 51.00 225.00 174.0
## p1price    4 2080   2.54  0.29   2.49    2.53  0.44  2.19   2.99   0.8
## p2price    5 2080   2.70  0.33   2.59    2.69  0.44  2.29   3.19   0.9
## p1prom     6 2080   0.10  0.30   0.00    0.00  0.00  0.00   1.00   1.0
## p2prom     7 2080   0.14  0.35   0.00    0.05  0.00  0.00   1.00   1.0
##         skew kurtosis   se
## Year    0.00    -2.00 0.01
## p1sales 0.74     0.66 0.62
## p2sales 0.99     1.51 0.54
## p1price 0.28    -1.44 0.01
## p2price 0.32    -1.40 0.01
## p1prom  2.66     5.10 0.01
## p2prom  2.09     2.38 0.01

apply()

# apply() [1 = rows] [2 = cols]
apply(store.df[, 2:9], MARGIN=2, FUN=mean)

##        Year        Week     p1sales     p2sales     p1price     p2price 
##   1.5000000  26.5000000 133.0485577 100.1567308   2.5443750   2.6995192 
##      p1prom      p2prom 
##   0.1000000   0.1384615

apply(store.df[, 2:9], 1, mean)[1:10]

##  [1] 29.9475 31.2475 32.9975 29.2725 31.2600 31.7850 27.5225 30.7850
##  [9] 28.0725 31.5600

apply(store.df[, 2:9], 2, sum)

##     Year     Week  p1sales  p2sales  p1price  p2price   p1prom   p2prom 
##   3120.0  55120.0 276741.0 208326.0   5292.3   5615.0    208.0    288.0

apply(store.df[, 2:9], 2, sd)

##       Year       Week    p1sales    p2sales    p1price    p2price 
##  0.5001202 15.0119401 28.3725990 24.4241905  0.2948819  0.3292181 
##     p1prom     p2prom 
##  0.3000721  0.3454668

apply(store.df[, 2:9], 2, function(x) { mean(x) - median(x) } )

##      Year      Week   p1sales   p2sales   p1price   p2price    p1prom 
## 0.0000000 0.0000000 4.0485577 4.1567308 0.0543750 0.1095192 0.1000000 
##    p2prom 
## 0.1384615

## creating a summary data frame using apply()
mysummary2.df <- data.frame(matrix(NA, nrow=2, ncol=2))
names(mysummary2.df) <- c("Median Sales", "IQR")
rownames(mysummary2.df) <- names(store.df)[4:5] # names from the data frame
mysummary2.df[, "Median Sales"] <- apply(store.df[, 4:5], 2, median)
mysummary2.df[, "IQR"]          <- apply(store.df[, 4:5], 2, IQR)
mysummary2.df

##         Median Sales IQR
## p1sales          129  37
## p2sales           96  29

single variable visualization

hist(store.df$p1sales)

## prettier version
hist(store.df$p1sales, 
     main="Product 1 Weekly Sales Frequencies, All Stores",    # add labels
     xlab="Product 1 Sales (Units)",
     ylab="Count" )

hist(store.df$p1sales, 
     main="Product 1 Weekly Sales Frequencies, All Stores",
     xlab="Product 1 Sales (Units)",
     ylab="Count",
     breaks=30,             # more columns 
     col="lightblue")       # color the bars

# relabeling the x axis

hist(store.df$p1sales, 
     main="Product 1 Weekly Sales Frequencies, All Stores\n\n",
     xlab="Product 1 Sales (Units)",
     ylab="Relative frequency",             # it's no londer the count!
     breaks=30, 
     col="lightblue", 
     freq=FALSE,                            # freq=FALSE means to plot density, not counts
     xaxt="n")                              # xaxt="n" means "x axis tick marks == no"

#[freq=FALSE] makes the Y axis comparable across different sized samples.

#side 1=below, 2=left, 3=above and 4=right.
axis(side=1, at=seq(60, 300, by=20))        # add the x axis (side=below)
#axis(side=3, at=seq(60, 300, by=20))        # add the x axis (side=top)
#axis(side=2, at=seq(0.00, 0.015, by=0.001))
axis(side=4, at=seq(0.00, 0.015, by=0.001))

# adding curves to the histogram

hist(store.df$p1sales, 
     main="Product 1 Weekly Sales Frequencies, All Stores",
     xlab="Product 1 Sales (Units)",
     ylab="Relative frequency",
     breaks=30, 
     col="lightblue", 
     freq=FALSE,                            # freq=FALSE means to plot density, not counts
     xaxt="n")                              # xaxt="n" means "x axis tick marks == no"

axis(side=1, at=seq(60, 300, by=20))        # add the x axis (side=1) tick marks we want


lines(density(store.df$p1sales, bw=10),    # "bw= ..." adjusts the smoothing
      type="l", col="darkred", lwd=2)      # lwd = line width

############
# boxplots and stripcharts

## boxplot

boxplot(store.df$p2sales, xlab="Weekly sales", ylab="P2",
        main="Weekly sales of P2, All stores", horizontal=TRUE)

boxplot(store.df$p2sales ~ store.df$storeNum, horizontal=TRUE,
     ylab="Store", xlab="Weekly unit sales", las=1,
     main="Weekly Sales of P2 by Store")

# using data=...
boxplot(p2sales ~ p2prom, data=store.df, horizontal=TRUE, yaxt="n", 
     ylab="P2 promoted in store?", xlab="Weekly sales",
     main="Weekly sales of P2 with and without promotion")
axis(side=2, at=c(1,2), labels=c("No", "Yes"))

### qq check for normality
#Quantile–quantile (QQ) plots are a good way to check one’s data against a distri- bution

qqnorm(store.df$p1sales)
##qqline adds a line to a “theoretical”, by default normal
##the solid line that represents an exact normal distribution.
qqline(store.df$p1sales)

### is more normal with log()
qqnorm(log(store.df$p1sales))
qqline(log(store.df$p1sales))

### another option using BoxCos 
qqnorm(scale(BoxCox(store.df$p1sales, BoxCox.lambda(x))))
qqline(scale(BoxCox(store.df$p1sales, BoxCox.lambda(x))))

### ecdf cumulative distribution plot
#very simple!!! cumulative proportion of data values in your sample
plot(ecdf(store.df$p1sales),
     main="Cumulative distribution of P1 Weekly Sales",
     ylab="Cumulative Proportion",
     xlab=c("P1 weekly sales, all stores", "90% of weeks sold <= 171 units"),
     yaxt="n")
axis(side=2, at=seq(0, 1, by=0.1), las=1, 
     labels=paste(seq(0,100,by=10), "%", sep=""))
# add lines for 90%
abline(h=0.9, lty=3)
# add lines for 70%
abline(h=0.7, lty=3)
# add lines for 50%
abline(h=0.5, lty=3)

#90% vertical line
abline(v=quantile(store.df$p1sales, pr=0.9), lty=3, col="lightblue")

### by() and aggregate()
by(store.df$p1sales, store.df$storeNum, mean)[c(1,2,3)]

## store.df$storeNum
##      101      102      103 
## 130.5385 134.7404 136.0385

by(store.df$p1sales, list(store.df$storeNum, store.df$Year), mean)[c(1,2,3),]

##            1        2
## 101 127.7885 133.2885
## 102 129.7115 139.7692
## 103 133.2308 138.8462

aggregate(store.df$p1sales, by=list(country=store.df$country), sum)

##   country     x
## 1      AU 14544
## 2      BR 27836
## 3      CN 27381
## 4      DE 68876
## 5      GB 40986
## 6      JP 55381
## 7      US 41737

world map

# aggregate the average per-store sales by country
p1sales.sum <- aggregate(store.df$p1sales, 
                         by=list(country=store.df$country), sum)

# install.packages(c("rworldmap", "RColorBrewer"))
library(rworldmap)

## Loading required package: sp
## ### Welcome to rworldmap ###
## For a short introduction type :   vignette('rworldmap')

library(RColorBrewer)

# create map
p1sales.map <- joinCountryData2Map(p1sales.sum, joinCode = "ISO2", nameJoinColumn = "country")

## 7 codes from your data successfully matched countries in the map
## 0 codes from your data failed to match with a country code in the map
## 236 codes from the map weren't represented in your data

mapCountryData(p1sales.map, nameColumnToPlot="x", 
               mapTitle="Total P1 sales by Country",
               colourPalette=brewer.pal(7, "Greens"), 
               catMethod="fixedWidth", addLegend=FALSE)

Reference

[How to Make Symbols on a Mac] http://www.wikihow.com/Make-Symbols-on-a-Mac

[Median Absolute Deviation]https://oku.edu.mie-u.ac.jp/~okumura/stat/basics.html