k.stores <- 20 # 20 stores
k.weeks <- 104 # 2 years of data each
store.df <- data.frame(matrix(NA, ncol=10, nrow=k.stores*k.weeks))
names(store.df) <- c("storeNum", "Year", "Week", "p1sales", "p2sales",
"p1price", "p2price", "p1prom", "p2prom", "country")
##次元(dimension)確認
dim(store.df)
## [1] 2080 10
#ストア番号は101から、 120まで
store.num <- 101:(100+k.stores)
#国設定
(store.cty <- c(rep("US", 3), rep("DE", 5), rep("GB", 3), rep("BR", 2),
rep("JP", 4), rep("AU", 1), rep("CN", 2)))
## [1] "US" "US" "US" "DE" "DE" "DE" "DE" "DE" "GB" "GB" "GB" "BR" "BR" "JP"
## [15] "JP" "JP" "JP" "AU" "CN" "CN"
length(store.cty) # make sure the country list is the right length
## [1] 20
store.df$storeNum <- rep(store.num, each=k.weeks)
store.df$country <- rep(store.cty, each=k.weeks)
# clean up、使った変数はコミ箱に!!
rm(store.num, store.cty)
store.df$Week <- rep(1:52, times=k.stores*2)
# try the inner parts of the next line to figure out how we use rep()
test <- (store.df$Year <- rep(rep(1:2, each=k.weeks/2), times=k.stores))
str(store.df)
## 'data.frame': 2080 obs. of 10 variables:
## $ storeNum: int 101 101 101 101 101 101 101 101 101 101 ...
## $ Year : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Week : int 1 2 3 4 5 6 7 8 9 10 ...
## $ p1sales : logi NA NA NA NA NA NA ...
## $ p2sales : logi NA NA NA NA NA NA ...
## $ p1price : logi NA NA NA NA NA NA ...
## $ p2price : logi NA NA NA NA NA NA ...
## $ p1prom : logi NA NA NA NA NA NA ...
## $ p2prom : logi NA NA NA NA NA NA ...
## $ country : chr "US" "US" "US" "US" ...
#facotr化する
store.df$storeNum <- factor(store.df$storeNum)
store.df$country <- factor(store.df$country)
#内容確認
str(store.df)
## 'data.frame': 2080 obs. of 10 variables:
## $ storeNum: Factor w/ 20 levels "101","102","103",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ Year : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Week : int 1 2 3 4 5 6 7 8 9 10 ...
## $ p1sales : logi NA NA NA NA NA NA ...
## $ p2sales : logi NA NA NA NA NA NA ...
## $ p1price : logi NA NA NA NA NA NA ...
## $ p2price : logi NA NA NA NA NA NA ...
## $ p1prom : logi NA NA NA NA NA NA ...
## $ p2prom : logi NA NA NA NA NA NA ...
## $ country : Factor w/ 7 levels "AU","BR","CN",..: 7 7 7 7 7 7 7 7 7 7 ...
#if you want to check data use head or tail
#head(store.df) # defaults to 6 rows
#head(store.df, 120) # 120 rows is enough to check 2 stores
#tail(store.df, 120) # make sure end looks OK too
# set random seed to make the random sequences replicable
#今わ説明しません。好きな番号でいいです。
set.seed(98250)
# promotion status, using binomial distribution, rbinom()
store.df$p1prom <- rbinom(n=nrow(store.df), size=1, p=0.1) # 10% promoted
mean(store.df$p1prom) #check mean is 0.1
## [1] 0.1
store.df$p2prom <- rbinom(n=nrow(store.df), size=1, p=0.15) # 15%
mean(store.df$p2prom) #check mean is 0.15
## [1] 0.1384615
#head(store.df)
# prices
store.df$p1price <- sample(x=c(2.19, 2.29, 2.49, 2.79, 2.99),
size=nrow(store.df), replace=TRUE)
unique(store.df$p1price) # check what sample did
## [1] 2.29 2.49 2.99 2.79 2.19
mean(store.df$p1price)
## [1] 2.544375
store.df$p2price <- sample(x=c(2.29, 2.49, 2.59, 2.99, 3.19),
size=nrow(store.df), replace=TRUE)
head(store.df)
## storeNum Year Week p1sales p2sales p1price p2price p1prom p2prom country
## 1 101 1 1 NA NA 2.29 2.29 0 0 US
## 2 101 1 2 NA NA 2.49 2.49 0 0 US
## 3 101 1 3 NA NA 2.99 2.99 1 0 US
## 4 101 1 4 NA NA 2.99 3.19 0 0 US
## 5 101 1 5 NA NA 2.49 2.59 0 1 US
## 6 101 1 6 NA NA 2.79 2.49 0 0 US
# sales data, using poisson (counts) distribution, rpois()
# first, the default sales in the absence of promotion
tmp.sales1 <- rpois(nrow(store.df), lambda=120) # lambda = mean sales per week
tmp.sales2 <- rpois(nrow(store.df), lambda=100) # lambda = mean sales per week
# scale sales according to the ratio of log(price)
tmp.sales1 <- tmp.sales1 * log(store.df$p2price) / log(store.df$p1price)
tmp.sales2 <- tmp.sales2 * log(store.df$p1price) / log(store.df$p2price)
# final sales get a 30% or 40% lift when promoted
store.df$p1sales <- floor(tmp.sales1 * (1 + store.df$p1prom*0.3))
store.df$p2sales <- floor(tmp.sales2 * (1 + store.df$p2prom*0.4))
#head(store.df)
# install.packages("car") #only run once
library(car)
some(store.df, 3)
## storeNum Year Week p1sales p2sales p1price p2price p1prom p2prom
## 27 101 1 27 135 99 2.29 2.49 0 0
## 144 102 1 40 123 113 2.79 2.59 0 0
## 1885 119 1 13 133 100 2.29 2.59 0 0
## country
## 27 US
## 144 US
## 1885 CN
# command to write it out to a CSV:
# write.csv(store.df, file="rintro-chapter3.csv",row.names=FALSE)
############
table(store.df$p1price)
##
## 2.19 2.29 2.49 2.79 2.99
## 395 444 423 443 375
p1.table <- table(store.df$p1price)
p1.table
##
## 2.19 2.29 2.49 2.79 2.99
## 395 444 423 443 375
str(p1.table)
## 'table' int [1:5(1d)] 395 444 423 443 375
## - attr(*, "dimnames")=List of 1
## ..$ : chr [1:5] "2.19" "2.29" "2.49" "2.79" ...
plot(p1.table)
table(store.df$p1price, store.df$p1prom)
##
## 0 1
## 2.19 354 41
## 2.29 398 46
## 2.49 381 42
## 2.79 396 47
## 2.99 343 32
p1.table2 <- table(store.df$p1price, store.df$p1prom)
#1がプロモションありです。プロモションした時の
p1.table2[, 2] / (p1.table2[, 1] + p1.table2[, 2])
## 2.19 2.29 2.49 2.79 2.99
## 0.10379747 0.10360360 0.09929078 0.10609481 0.08533333
#For skewed and asymmetric distributions that are common in marketing, such as unit sales or household income, the arithmetic mean() and standard deviation sd() may be misleading; in those cases, the median() and interquartile range (IQR(), the range of the middle 50 % of data) are often more useful to summarize a distribution.
min(store.df$p1sales)
## [1] 73
max(store.df$p2sales)
## [1] 225
mean(store.df$p1prom)
## [1] 0.1
median(store.df$p2sales)
## [1] 96
var(store.df$p1sales)
## [1] 805.0044
sd(store.df$p1sales)
## [1] 28.3726
#Q3 - Q1
#Interquartile range, 75th–25th per- centile
IQR(store.df$p1sales) #default = type = 7 na.rm = FALSE
## [1] 37
#Q3とQ1をboxplotを使って確認してみましょう。
q3q1 <- boxplot(store.df$p1sales)$stats[c(4,2),]
q3q1[1]-q3q1[2]
## [1] 37
#中央絶対偏差Median Absolute Deviation
x <- store.df$p1sales
mad(x) #qnorm(0.75) ≈ 1/1.4826 = zscore(0.67)
## [1] 26.6868
#First, determine how many standard deviations above the mean one would have to be to be in the 75th percentile. This can be found by using a z table and finding the z associated with 0.75. The value of z is 0.674. Thus, one must be .674 standard deviations above the mean to be in the 75th percentile.
1.4826*median(abs(x-median(x)))
## [1] 26.6868
#The MAD is the median of the absolute values of the differences between the data values and the overall median of the data set; for a Gaussian distribution, MAD is related to σ as \sigma \approx 1.4826\ \operatorname{MAD}. \, (The derivation can be found here.)
1.4826*median(abs(x-median(x))) == mad(x)
## [1] TRUE
x.quantile <- quantile(store.df$p1sales, probs=c(0.25,0.50,0.75)) # interquartile range
median(store.df$p1sales)== x.quantile["50%"]
## 50%
## TRUE
quantile(store.df$p1sales, probs=c(0.05, 0.95)) # central 90%
## 5% 95%
## 93 184
quantile(store.df$p1sales, probs=0:10/10)
## 0% 10% 20% 30% 40% 50% 60% 70% 80% 90% 100%
## 73.0 100.0 109.0 117.0 122.6 129.0 136.0 145.0 156.0 171.0 263.0
## Build a summary
mysummary.df <- data.frame(matrix(NA, nrow=2, ncol=2))
names(mysummary.df) <- c("Median Sales", "IQR")
rownames(mysummary.df) <- c("Product 1", "Product 2")
mysummary.df["Product 1", "Median Sales"] <- median(store.df$p1sales)
mysummary.df["Product 2", "Median Sales"] <- median(store.df$p2sales)
mysummary.df["Product 1", "IQR"] <- IQR(store.df$p1sales)
mysummary.df["Product 2", "IQR"] <- IQR(store.df$p2sales)
mysummary.df
## Median Sales IQR
## Product 1 129 37
## Product 2 96 29
## Descriptive summaries
summary(store.df)
## storeNum Year Week p1sales
## 101 : 104 Min. :1.0 Min. : 1.00 Min. : 73
## 102 : 104 1st Qu.:1.0 1st Qu.:13.75 1st Qu.:113
## 103 : 104 Median :1.5 Median :26.50 Median :129
## 104 : 104 Mean :1.5 Mean :26.50 Mean :133
## 105 : 104 3rd Qu.:2.0 3rd Qu.:39.25 3rd Qu.:150
## 106 : 104 Max. :2.0 Max. :52.00 Max. :263
## (Other):1456
## p2sales p1price p2price p1prom
## Min. : 51.0 Min. :2.190 Min. :2.29 Min. :0.0
## 1st Qu.: 84.0 1st Qu.:2.290 1st Qu.:2.49 1st Qu.:0.0
## Median : 96.0 Median :2.490 Median :2.59 Median :0.0
## Mean :100.2 Mean :2.544 Mean :2.70 Mean :0.1
## 3rd Qu.:113.0 3rd Qu.:2.790 3rd Qu.:2.99 3rd Qu.:0.0
## Max. :225.0 Max. :2.990 Max. :3.19 Max. :1.0
##
## p2prom country
## Min. :0.0000 AU:104
## 1st Qu.:0.0000 BR:208
## Median :0.0000 CN:208
## Mean :0.1385 DE:520
## 3rd Qu.:0.0000 GB:312
## Max. :1.0000 JP:416
## US:312
summary(store.df$Year)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.0 1.0 1.5 1.5 2.0 2.0
#The digits= argument is helpful if you wish to change the precision of the display:
summary(store.df, digits=2)
## storeNum Year Week p1sales p2sales
## 101 : 104 Min. :1.0 Min. : 1 Min. : 73 Min. : 51
## 102 : 104 1st Qu.:1.0 1st Qu.:14 1st Qu.:113 1st Qu.: 84
## 103 : 104 Median :1.5 Median :26 Median :129 Median : 96
## 104 : 104 Mean :1.5 Mean :26 Mean :133 Mean :100
## 105 : 104 3rd Qu.:2.0 3rd Qu.:39 3rd Qu.:150 3rd Qu.:113
## 106 : 104 Max. :2.0 Max. :52 Max. :263 Max. :225
## (Other):1456
## p1price p2price p1prom p2prom country
## Min. :2.2 Min. :2.3 Min. :0.0 Min. :0.00 AU:104
## 1st Qu.:2.3 1st Qu.:2.5 1st Qu.:0.0 1st Qu.:0.00 BR:208
## Median :2.5 Median :2.6 Median :0.0 Median :0.00 CN:208
## Mean :2.5 Mean :2.7 Mean :0.1 Mean :0.14 DE:520
## 3rd Qu.:2.8 3rd Qu.:3.0 3rd Qu.:0.0 3rd Qu.:0.00 GB:312
## Max. :3.0 Max. :3.2 Max. :1.0 Max. :1.00 JP:416
## US:312
# install.packages("psych") #only run once
library(psych)
describe(store.df)
## vars n mean sd median trimmed mad min max range
## storeNum* 1 2080 10.50 5.77 10.50 10.50 7.41 1.00 20.00 19.0
## Year 2 2080 1.50 0.50 1.50 1.50 0.74 1.00 2.00 1.0
## Week 3 2080 26.50 15.01 26.50 26.50 19.27 1.00 52.00 51.0
## p1sales 4 2080 133.05 28.37 129.00 131.08 26.69 73.00 263.00 190.0
## p2sales 5 2080 100.16 24.42 96.00 98.05 22.24 51.00 225.00 174.0
## p1price 6 2080 2.54 0.29 2.49 2.53 0.44 2.19 2.99 0.8
## p2price 7 2080 2.70 0.33 2.59 2.69 0.44 2.29 3.19 0.9
## p1prom 8 2080 0.10 0.30 0.00 0.00 0.00 0.00 1.00 1.0
## p2prom 9 2080 0.14 0.35 0.00 0.05 0.00 0.00 1.00 1.0
## country* 10 2080 4.55 1.72 4.50 4.62 2.22 1.00 7.00 6.0
## skew kurtosis se
## storeNum* 0.00 -1.21 0.13
## Year 0.00 -2.00 0.01
## Week 0.00 -1.20 0.33
## p1sales 0.74 0.66 0.62
## p2sales 0.99 1.51 0.54
## p1price 0.28 -1.44 0.01
## p2price 0.32 -1.40 0.01
## p1prom 2.66 5.10 0.01
## p2prom 2.09 2.38 0.01
## country* -0.29 -0.81 0.04
describe(store.df[ , c(2, 4:9)])
## vars n mean sd median trimmed mad min max range
## Year 1 2080 1.50 0.50 1.50 1.50 0.74 1.00 2.00 1.0
## p1sales 2 2080 133.05 28.37 129.00 131.08 26.69 73.00 263.00 190.0
## p2sales 3 2080 100.16 24.42 96.00 98.05 22.24 51.00 225.00 174.0
## p1price 4 2080 2.54 0.29 2.49 2.53 0.44 2.19 2.99 0.8
## p2price 5 2080 2.70 0.33 2.59 2.69 0.44 2.29 3.19 0.9
## p1prom 6 2080 0.10 0.30 0.00 0.00 0.00 0.00 1.00 1.0
## p2prom 7 2080 0.14 0.35 0.00 0.05 0.00 0.00 1.00 1.0
## skew kurtosis se
## Year 0.00 -2.00 0.01
## p1sales 0.74 0.66 0.62
## p2sales 0.99 1.51 0.54
## p1price 0.28 -1.44 0.01
## p2price 0.32 -1.40 0.01
## p1prom 2.66 5.10 0.01
## p2prom 2.09 2.38 0.01
# apply() [1 = rows] [2 = cols]
apply(store.df[, 2:9], MARGIN=2, FUN=mean)
## Year Week p1sales p2sales p1price p2price
## 1.5000000 26.5000000 133.0485577 100.1567308 2.5443750 2.6995192
## p1prom p2prom
## 0.1000000 0.1384615
apply(store.df[, 2:9], 1, mean)[1:10]
## [1] 29.9475 31.2475 32.9975 29.2725 31.2600 31.7850 27.5225 30.7850
## [9] 28.0725 31.5600
apply(store.df[, 2:9], 2, sum)
## Year Week p1sales p2sales p1price p2price p1prom p2prom
## 3120.0 55120.0 276741.0 208326.0 5292.3 5615.0 208.0 288.0
apply(store.df[, 2:9], 2, sd)
## Year Week p1sales p2sales p1price p2price
## 0.5001202 15.0119401 28.3725990 24.4241905 0.2948819 0.3292181
## p1prom p2prom
## 0.3000721 0.3454668
apply(store.df[, 2:9], 2, function(x) { mean(x) - median(x) } )
## Year Week p1sales p2sales p1price p2price p1prom
## 0.0000000 0.0000000 4.0485577 4.1567308 0.0543750 0.1095192 0.1000000
## p2prom
## 0.1384615
## creating a summary data frame using apply()
mysummary2.df <- data.frame(matrix(NA, nrow=2, ncol=2))
names(mysummary2.df) <- c("Median Sales", "IQR")
rownames(mysummary2.df) <- names(store.df)[4:5] # names from the data frame
mysummary2.df[, "Median Sales"] <- apply(store.df[, 4:5], 2, median)
mysummary2.df[, "IQR"] <- apply(store.df[, 4:5], 2, IQR)
mysummary2.df
## Median Sales IQR
## p1sales 129 37
## p2sales 96 29
hist(store.df$p1sales)
## prettier version
hist(store.df$p1sales,
main="Product 1 Weekly Sales Frequencies, All Stores", # add labels
xlab="Product 1 Sales (Units)",
ylab="Count" )
hist(store.df$p1sales,
main="Product 1 Weekly Sales Frequencies, All Stores",
xlab="Product 1 Sales (Units)",
ylab="Count",
breaks=30, # more columns
col="lightblue") # color the bars
# relabeling the x axis
hist(store.df$p1sales,
main="Product 1 Weekly Sales Frequencies, All Stores\n\n",
xlab="Product 1 Sales (Units)",
ylab="Relative frequency", # it's no londer the count!
breaks=30,
col="lightblue",
freq=FALSE, # freq=FALSE means to plot density, not counts
xaxt="n") # xaxt="n" means "x axis tick marks == no"
#[freq=FALSE] makes the Y axis comparable across different sized samples.
#side 1=below, 2=left, 3=above and 4=right.
axis(side=1, at=seq(60, 300, by=20)) # add the x axis (side=below)
#axis(side=3, at=seq(60, 300, by=20)) # add the x axis (side=top)
#axis(side=2, at=seq(0.00, 0.015, by=0.001))
axis(side=4, at=seq(0.00, 0.015, by=0.001))
# adding curves to the histogram
hist(store.df$p1sales,
main="Product 1 Weekly Sales Frequencies, All Stores",
xlab="Product 1 Sales (Units)",
ylab="Relative frequency",
breaks=30,
col="lightblue",
freq=FALSE, # freq=FALSE means to plot density, not counts
xaxt="n") # xaxt="n" means "x axis tick marks == no"
axis(side=1, at=seq(60, 300, by=20)) # add the x axis (side=1) tick marks we want
lines(density(store.df$p1sales, bw=10), # "bw= ..." adjusts the smoothing
type="l", col="darkred", lwd=2) # lwd = line width
############
# boxplots and stripcharts
## boxplot
boxplot(store.df$p2sales, xlab="Weekly sales", ylab="P2",
main="Weekly sales of P2, All stores", horizontal=TRUE)
boxplot(store.df$p2sales ~ store.df$storeNum, horizontal=TRUE,
ylab="Store", xlab="Weekly unit sales", las=1,
main="Weekly Sales of P2 by Store")
# using data=...
boxplot(p2sales ~ p2prom, data=store.df, horizontal=TRUE, yaxt="n",
ylab="P2 promoted in store?", xlab="Weekly sales",
main="Weekly sales of P2 with and without promotion")
axis(side=2, at=c(1,2), labels=c("No", "Yes"))
### qq check for normality
#Quantile–quantile (QQ) plots are a good way to check one’s data against a distri- bution
qqnorm(store.df$p1sales)
##qqline adds a line to a “theoretical”, by default normal
##the solid line that represents an exact normal distribution.
qqline(store.df$p1sales)
### is more normal with log()
qqnorm(log(store.df$p1sales))
qqline(log(store.df$p1sales))
### another option using BoxCos
qqnorm(scale(BoxCox(store.df$p1sales, BoxCox.lambda(x))))
qqline(scale(BoxCox(store.df$p1sales, BoxCox.lambda(x))))
### ecdf cumulative distribution plot
#very simple!!! cumulative proportion of data values in your sample
plot(ecdf(store.df$p1sales),
main="Cumulative distribution of P1 Weekly Sales",
ylab="Cumulative Proportion",
xlab=c("P1 weekly sales, all stores", "90% of weeks sold <= 171 units"),
yaxt="n")
axis(side=2, at=seq(0, 1, by=0.1), las=1,
labels=paste(seq(0,100,by=10), "%", sep=""))
# add lines for 90%
abline(h=0.9, lty=3)
# add lines for 70%
abline(h=0.7, lty=3)
# add lines for 50%
abline(h=0.5, lty=3)
#90% vertical line
abline(v=quantile(store.df$p1sales, pr=0.9), lty=3, col="lightblue")
### by() and aggregate()
by(store.df$p1sales, store.df$storeNum, mean)[c(1,2,3)]
## store.df$storeNum
## 101 102 103
## 130.5385 134.7404 136.0385
by(store.df$p1sales, list(store.df$storeNum, store.df$Year), mean)[c(1,2,3),]
## 1 2
## 101 127.7885 133.2885
## 102 129.7115 139.7692
## 103 133.2308 138.8462
aggregate(store.df$p1sales, by=list(country=store.df$country), sum)
## country x
## 1 AU 14544
## 2 BR 27836
## 3 CN 27381
## 4 DE 68876
## 5 GB 40986
## 6 JP 55381
## 7 US 41737
# aggregate the average per-store sales by country
p1sales.sum <- aggregate(store.df$p1sales,
by=list(country=store.df$country), sum)
# install.packages(c("rworldmap", "RColorBrewer"))
library(rworldmap)
## Loading required package: sp
## ### Welcome to rworldmap ###
## For a short introduction type : vignette('rworldmap')
library(RColorBrewer)
# create map
p1sales.map <- joinCountryData2Map(p1sales.sum, joinCode = "ISO2", nameJoinColumn = "country")
## 7 codes from your data successfully matched countries in the map
## 0 codes from your data failed to match with a country code in the map
## 236 codes from the map weren't represented in your data
mapCountryData(p1sales.map, nameColumnToPlot="x",
mapTitle="Total P1 sales by Country",
colourPalette=brewer.pal(7, "Greens"),
catMethod="fixedWidth", addLegend=FALSE)
[How to Make Symbols on a Mac] http://www.wikihow.com/Make-Symbols-on-a-Mac
[Median Absolute Deviation]https://oku.edu.mie-u.ac.jp/~okumura/stat/basics.html