library(dplyr)
library(ggplot2)
# Load the wholesale customer data and prepare it for exploration.
W <- read.csv("wholesales.csv")

# Recode the Channel and Region codes as labelled factors
# ("Ch<code>" and "Reg<code>").
W$Channel <- factor(paste0("Ch", W$Channel))
W$Region <- factor(paste0("Reg", W$Region))

# Log10-transform every spending column (columns 3 to 8).
# The right-hand side must cover the same six columns as the left-hand side:
# with only W[3:6] as the source, the four results were recycled and
# columns 7 and 8 were overwritten with copies of transformed columns 3 and 4.
W[3:8] <- lapply(W[3:8], log, base = 10)

# Give column 7 a shorter name.
names(W)[7] <- "Detergents"

summary(W)
##  Channel    Region        Fresh             Milk          Grocery      
##  Ch1:298   Reg1: 77   Min.   :0.4771   Min.   :1.740   Min.   :0.4771  
##  Ch2:142   Reg2: 47   1st Qu.:3.4952   1st Qu.:3.186   1st Qu.:3.3330  
##            Reg3:316   Median :3.9296   Median :3.560   Median :3.6772  
##                       Mean   :3.7916   Mean   :3.527   Mean   :3.6660  
##                       3rd Qu.:4.2288   3rd Qu.:3.857   3rd Qu.:4.0276  
##                       Max.   :5.0498   Max.   :4.866   Max.   :4.9675  
# NOTE(review): the Detergents and Delicassen figures recorded below predate
# the recycling fix above (they repeat the Fresh and Milk figures exactly);
# re-run summary(W) to refresh them.
##      Frozen        Detergents       Delicassen   
##  Min.   :1.398   Min.   :0.4771   Min.   :1.740  
##  1st Qu.:2.871   1st Qu.:3.4952   1st Qu.:3.186  
##  Median :3.184   Median :3.9296   Median :3.560  
##  Mean   :3.171   Mean   :3.7916   Mean   :3.527  
##  3rd Qu.:3.551   3rd Qu.:4.2288   3rd Qu.:3.857  
##  Max.   :4.784   Max.   :5.0498   Max.   :4.866

# 單一變數的變化 (Variation within a single variable)

# Categorical variable: count the rows in each channel, then chart the counts.
table(W[["Channel"]])
## 
## Ch1 Ch2 
## 298 142
barplot(table(W[["Channel"]]))

# Numeric variable: summary statistics first, then the distribution.
summary(W[["Fresh"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.4771  3.4952  3.9296  3.7916  4.2288  5.0498
# The argument is kept as `W$Fresh` because hist() builds its default title
# and x-axis label from the text of the expression it is given.
hist(W$Fresh)


# 兩個變數之間的關係 (Relationship between two variables)

# Categorical x categorical: bar chart of the Channel-by-Region counts.
table(W$Channel, W$Region) %>% barplot()

# Categorical x numeric: aggregate Milk within groups.
# Sum of Milk for each Region.
tapply(W$Milk, W$Region, sum)
##      Reg1      Reg2      Reg3 
##  270.0807  163.2922 1118.4746
# Mean of Milk for every Channel x Region combination.
tapply(W$Milk, list(W$Channel, W$Region), mean)
##         Reg1     Reg2     Reg3
## Ch1 3.368636 3.227366 3.346987
## Ch2 3.962843 3.838205 3.926289

# Milk was already log10-transformed when the data was loaded, so it is
# plotted as-is; wrapping it in log() again would take the log of a log.
# The axis label states the scale explicitly.
ggplot(W, aes(x = Milk)) +
  geom_histogram(aes(fill = Region), alpha = 0.5, bins = 20) +
  facet_grid(Channel ~ Region) +
  labs(title = "Dist. of Sales of Milk", x = "log10(Milk)")

# Numeric x numeric: Fresh against Milk (both already on a log10 scale) with a
# fitted straight line; `se = FALSE` (not the reassignable `F`) hides the
# confidence band.
ggplot(W, aes(x = Milk, y = Fresh)) +
  geom_point(size = 2) +
  stat_smooth(method = "lm", se = FALSE) +
  labs(x = "log10(Milk)", y = "log10(Fresh)")

# 兩個數量之間的關係在不同族群之中是否相同？ (Is the relationship between two numeric variables the same across different groups?)

# (Numeric x numeric) by categorical.
# Shared scatter of Fresh against Milk. Both columns were log10-transformed
# when the data was loaded, so they are plotted directly rather than passed
# through log() a second time; the axis labels state the actual scale.
milk_vs_fresh <- ggplot(W, aes(x = Milk, y = Fresh)) +
  geom_point(size = 2) +
  labs(x = "log10(Milk)", y = "log10(Fresh)")

# One panel per Channel, each with its own fitted line
# (`se = FALSE` hides the confidence band).
milk_vs_fresh +
  stat_smooth(method = "lm", se = FALSE) +
  facet_grid(~Channel)

# One panel per Region.
milk_vs_fresh +
  stat_smooth(method = "lm", se = FALSE) +
  facet_grid(~Region)

# (Numeric x numeric) by (categorical x categorical):
# one panel per Channel x Region combination, fitted line drawn in red.
milk_vs_fresh +
  stat_smooth(method = "lm", se = FALSE, colour = "red") +
  facet_grid(Channel ~ Region)