Chapter 5 - Assessing differences between groups

5.1 Comparing Groups: Tables and Visualization
5.2 Finding Descriptives by Group

5.1 Comparing Groups: Tables and Visualization

# Variables simulated for every segment (one matrix column per variable)
segVars <- c("age", "gender", "income", "kids", "ownHome", "subscribe")

# Distribution used for each variable, in the same order as segVars
segVarType <- c("norm", "binom", "norm", "pois", "binom", "binom")

# Segment labels
segNames <- c("Suburb mix", "Urban hip", "Travelers", "Moving up")

# Number of simulated rows (N) per segment, in segNames order
segSize <- c(100, 50, 80, 70)

# Per-segment location parameter of each variable:
# one row per segment, one named column per variable
segMeans <- matrix(
  c(40, .5, 55000, 2, .5, .1,
    24, .7, 21000, 1, .2, .2,
    58, .5, 64000, 0, .7, .05,
    36, .3, 52000, 2, .3, .2),
  ncol = length(segVars), byrow = TRUE,
  dimnames = list(NULL, segVars)
)
segMeans

##      age gender income kids ownHome subscribe
## [1,]  40    0.5  55000    2     0.5      0.10
## [2,]  24    0.7  21000    1     0.2      0.20
## [3,]  58    0.5  64000    0     0.7      0.05
## [4,]  36    0.3  52000    2     0.3      0.20

# Per-segment standard deviations, same layout as segMeans
# (NA where no standard deviation is given for the variable)
segSDs <- matrix(
  c(5, NA, 12000, NA, NA, NA,
    2, NA,  5000, NA, NA, NA,
    8, NA, 21000, NA, NA, NA,
    4, NA, 10000, NA, NA, NA),
  ncol = length(segVars), byrow = TRUE,
  dimnames = list(NULL, segVars)
)
segSDs

##      age gender income kids ownHome subscribe
## [1,]   5     NA  12000   NA      NA        NA
## [2,]   2     NA   5000   NA      NA        NA
## [3,]   8     NA  21000   NA      NA        NA
## [4,]   4     NA  10000   NA      NA        NA

### 5.1.2 Language Brief: for() Loops
# Square roots of 2.1, 3.8 and 5.5, repeated three times; the outer
# parentheses print the value as it is assigned
(i.seq <- rep(sqrt(seq(2.1, 6.2, by = 1.7)), times = 3))

## [1] 1.449138 1.949359 2.345208 1.449138 1.949359 2.345208 1.449138 1.949359
## [9] 2.345208

for (i in i.seq ) { print(i) }

## [1] 1.449138
## [1] 1.949359
## [1] 2.345208
## [1] 1.449138
## [1] 1.949359
## [1] 2.345208
## [1] 1.449138
## [1] 1.949359
## [1] 2.345208

for (i in c("Hello ","world, ","welcome to R!")) { cat(i) }

## Hello world, welcome to R!

for (i in 1:length(i.seq)) { cat("Entry", i, "=", i.seq[i], "\n") }

## Entry 1 = 1.449138 
## Entry 2 = 1.949359 
## Entry 3 = 2.345208 
## Entry 4 = 1.449138 
## Entry 5 = 1.949359 
## Entry 6 = 2.345208 
## Entry 7 = 1.449138 
## Entry 8 = 1.949359 
## Entry 9 = 2.345208

for (i in seq_along(i.seq)) { cat("Entry", i, "=", i.seq[i], "\n") }

## Entry 1 = 1.449138 
## Entry 2 = 1.949359 
## Entry 3 = 2.345208 
## Entry 4 = 1.449138 
## Entry 5 = 1.949359 
## Entry 6 = 2.345208 
## Entry 7 = 1.449138 
## Entry 8 = 1.949359 
## Entry 9 = 2.345208

# Pitfalls of for loops
# Compare the lengths of NA and NULL
c(na = length(NA),null = length(NULL))

##   na null 
##    1    0

# What does 1:length(x), as used in a for loop, iterate over?
c(1:length(NA))

## [1] 1

# Watch out: for NULL this is 1:0, which counts down and has two elements
c(1:length(NULL))

## [1] 1 0

# When the vector is NULL (length 0) the body still runs twice
i.seq <- NULL
for (i in 1:length(i.seq)) { print (i) }     # bad

## [1] 1
## [1] 0

# When the vector is a single NA (length 1)
i.seq <- NA
for (i in 1:length(i.seq)) { print (i) }     # bad

## [1] 1

# seq_along() gives an empty sequence for NULL, so the body is skipped
for (i in seq_along(NULL)) { print (i) }     # better

# seq() iterates once for NA and not at all for NULL
# NOTE: seq(x) with a single length-one numeric x means 1:x (see ?seq), so
# seq_along() is the safer general-purpose choice
for (i in seq(NA)) { print (i) }     # better

## [1] 1

for (i in seq(NULL)) { print (i) }     # better

# x > 1 is evaluated element-wise (x[1] > 1, x[2] > 1, and so forth) and
# yields a logical vector, but if() expects a single TRUE/FALSE.
# x is a vector here, not a scalar.
# NOTE(review): the warning recorded below is from an older R; in
# R >= 4.2.0 a condition of length > 1 is an error -- confirm against the R
# version in use before running this chunk.
x <- 1:5
if (x > 1) {      # bad code -- will produce warning!
  print ("hi") 
} else { 
  print ("bye") 
}

## Warning in if (x > 1) {: the condition has length > 1 and only the first
## element will be used

## [1] "bye"

ifelse(x > 1, "hi", "bye") # better

## [1] "bye" "hi"  "hi"  "hi"  "hi"

# Using functions to supply the values
# Both helpers append "dan" to their argument (paste() separates with a space)
fn.hi <- function(x) paste(x, "dan")
fn.bye <- function(x) paste(x, "dan")
ifelse(x > 2, fn.hi("hi"), fn.bye("bye") )

## [1] "bye dan" "bye dan" "hi dan"  "hi dan"  "hi dan"

###5.1.4 Final Segment Data Generation
# Make sure we start from a known state: empty result and a fixed seed.
# Per-segment data frames are collected in a preallocated list and row-bound
# once after the loop, instead of growing seg.df with rbind() every iteration.
seg.df <- NULL
set.seed(02554)
seg.parts <- vector("list", length(segNames))

# iterate over all the segments and create data for each
for (i in seq_along(segNames)) {
  # progress output
  cat(i, segNames[i], "\n")
  cat(i, segSize[i], "\n")
  # NOTE(review): segVars is indexed by the segment counter i here, so this
  # prints the i-th variable name, not something specific to segment i.
  cat(i, segVars[i], "\n")

  # all-NA data frame with one row per respondent and one column per variable
  this.seg <- data.frame(matrix(NA, nrow=segSize[i], ncol=length(segVars)))

  # within a segment, fill each column with random draws from the
  # distribution named in segVarType
  for (j in seq_along(segVars)) {
    if (segVarType[j] == "norm") {   # draw random normals
      this.seg[, j] <- rnorm(segSize[i], mean=segMeans[i, j], sd=segSDs[i, j])
    } else if (segVarType[j] == "pois") {    # draw counts
      this.seg[, j] <- rpois(segSize[i], lambda=segMeans[i, j])
    } else if (segVarType[j] == "binom") {   # draw 0/1 outcomes
      this.seg[, j] <- rbinom(segSize[i], size=1, prob=segMeans[i, j])
    } else {
      stop("Bad segment data type: ", segVarType[j])
    }
  }
  # keep this segment; it is added to the total dataset after the loop
  seg.parts[[i]] <- this.seg
}
# stack all segments in order (the result is NULL when there are no segments)
seg.df <- do.call(rbind, seg.parts)

## 1 Suburb mix 
## 1 100 
## 1 age 
## 2 Urban hip 
## 2 50 
## 2 gender 
## 3 Travelers 
## 3 80 
## 3 income 
## 4 Moving up 
## 4 70 
## 4 kids

# make the data frame names match what we defined
names(seg.df) <- segVars
# add segment membership for each row
seg.df$Segment   <- factor(rep(segNames, times=segSize))
# Convert the 0/1 binomial variables to nicely labeled factors. The levels
# are spelled out so that 0 always maps to the first label and 1 to the
# second, even if a column happens to contain only one of the two values
# (factor() with two labels but a single observed level would stop with an error).
seg.df$ownHome   <- factor(seg.df$ownHome, levels=c(0, 1),
                           labels=c("ownNo", "ownYes"))
seg.df$gender    <- factor(seg.df$gender, levels=c(0, 1),
                           labels=c("Female", "Male"))
seg.df$subscribe <- factor(seg.df$subscribe, levels=c(0, 1),
                           labels=c("subNo", "subYes"))

# check the data and confirm it
summary(seg.df)

##       age           gender        income            kids        ownHome   
##  Min.   :19.26   Female:157   Min.   : -5183   Min.   :0.00   ownNo :159  
##  1st Qu.:33.01   Male  :143   1st Qu.: 39656   1st Qu.:0.00   ownYes:141  
##  Median :39.49                Median : 52014   Median :1.00               
##  Mean   :41.20                Mean   : 50937   Mean   :1.27               
##  3rd Qu.:47.90                3rd Qu.: 61403   3rd Qu.:2.00               
##  Max.   :80.49                Max.   :114278   Max.   :7.00               
##   subscribe         Segment   
##  subNo :260   Moving up : 70  
##  subYes: 40   Suburb mix:100  
##               Travelers : 80  
##               Urban hip : 50  
##                               
##

5.2 Finding Descriptives by Group

mean(seg.df$income[seg.df$Segment == "Moving up"])

## [1] 53090.97

mean(seg.df$income[seg.df$Segment == "Moving up" & 
                   seg.df$subscribe=="subNo"])

## [1] 53633.73

by(seg.df$income, seg.df$Segment, mean)

## seg.df$Segment: Moving up
## [1] 53090.97
## -------------------------------------------------------- 
## seg.df$Segment: Suburb mix
## [1] 55033.82
## -------------------------------------------------------- 
## seg.df$Segment: Travelers
## [1] 62213.94
## -------------------------------------------------------- 
## seg.df$Segment: Urban hip
## [1] 21681.93

by(seg.df$income, list(seg.df$Segment, seg.df$subscribe), mean)

## : Moving up
## : subNo
## [1] 53633.73
## -------------------------------------------------------- 
## : Suburb mix
## : subNo
## [1] 54942.69
## -------------------------------------------------------- 
## : Travelers
## : subNo
## [1] 62746.11
## -------------------------------------------------------- 
## : Urban hip
## : subNo
## [1] 22082.11
## -------------------------------------------------------- 
## : Moving up
## : subYes
## [1] 50919.89
## -------------------------------------------------------- 
## : Suburb mix
## : subYes
## [1] 56461.41
## -------------------------------------------------------- 
## : Travelers
## : subYes
## [1] 58488.77
## -------------------------------------------------------- 
## : Urban hip
## : subYes
## [1] 20081.19

aggregate(seg.df$income, list(seg.df$Segment), mean)

##      Group.1        x
## 1  Moving up 53090.97
## 2 Suburb mix 55033.82
## 3  Travelers 62213.94
## 4  Urban hip 21681.93

# Mean income per segment: one row per Segment level, means in column 2
seg.income.mean <- aggregate(seg.df$income, list(seg.df$Segment), mean)
# A factor used as a row index is applied through its integer codes, so each
# row of seg.df picks the row of seg.income.mean at its segment's level
# position. This relies on the aggregate rows being in level order.
seg.df$segIncome <- seg.income.mean[seg.df$Segment, 2]

# spot-check some rows of the result
car::some(seg.df)

##          age gender   income kids ownHome subscribe    Segment segIncome
## 58  34.46528   Male 60971.76    2   ownNo     subNo Suburb mix  55033.82
## 79  42.31337   Male 49674.79    0  ownYes     subNo Suburb mix  55033.82
## 124 22.30333 Female 24541.24    1   ownNo     subNo  Urban hip  21681.93
## 136 23.08861   Male 33909.50    3   ownNo     subNo  Urban hip  21681.93
## 158 43.35230   Male 51787.88    0   ownNo     subNo  Travelers  62213.94
## 186 48.84991 Female 59075.12    0   ownNo     subNo  Travelers  62213.94
## 232 37.85733   Male 50980.48    5  ownYes     subNo  Moving up  53090.97
## 243 36.97339 Female 53792.40    0   ownNo     subNo  Moving up  53090.97
## 257 43.30929 Female 69549.93    3   ownNo    subYes  Moving up  53090.97
## 288 35.82586 Female 41766.29    2   ownNo    subYes  Moving up  53090.97

seg.df$Segment

##   [1] Suburb mix Suburb mix Suburb mix Suburb mix Suburb mix Suburb mix
##   [7] Suburb mix Suburb mix Suburb mix Suburb mix Suburb mix Suburb mix
##  [13] Suburb mix Suburb mix Suburb mix Suburb mix Suburb mix Suburb mix
##  [19] Suburb mix Suburb mix Suburb mix Suburb mix Suburb mix Suburb mix
##  [25] Suburb mix Suburb mix Suburb mix Suburb mix Suburb mix Suburb mix
##  [31] Suburb mix Suburb mix Suburb mix Suburb mix Suburb mix Suburb mix
##  [37] Suburb mix Suburb mix Suburb mix Suburb mix Suburb mix Suburb mix
##  [43] Suburb mix Suburb mix Suburb mix Suburb mix Suburb mix Suburb mix
##  [49] Suburb mix Suburb mix Suburb mix Suburb mix Suburb mix Suburb mix
##  [55] Suburb mix Suburb mix Suburb mix Suburb mix Suburb mix Suburb mix
##  [61] Suburb mix Suburb mix Suburb mix Suburb mix Suburb mix Suburb mix
##  [67] Suburb mix Suburb mix Suburb mix Suburb mix Suburb mix Suburb mix
##  [73] Suburb mix Suburb mix Suburb mix Suburb mix Suburb mix Suburb mix
##  [79] Suburb mix Suburb mix Suburb mix Suburb mix Suburb mix Suburb mix
##  [85] Suburb mix Suburb mix Suburb mix Suburb mix Suburb mix Suburb mix
##  [91] Suburb mix Suburb mix Suburb mix Suburb mix Suburb mix Suburb mix
##  [97] Suburb mix Suburb mix Suburb mix Suburb mix Urban hip  Urban hip 
## [103] Urban hip  Urban hip  Urban hip  Urban hip  Urban hip  Urban hip 
## [109] Urban hip  Urban hip  Urban hip  Urban hip  Urban hip  Urban hip 
## [115] Urban hip  Urban hip  Urban hip  Urban hip  Urban hip  Urban hip 
## [121] Urban hip  Urban hip  Urban hip  Urban hip  Urban hip  Urban hip 
## [127] Urban hip  Urban hip  Urban hip  Urban hip  Urban hip  Urban hip 
## [133] Urban hip  Urban hip  Urban hip  Urban hip  Urban hip  Urban hip 
## [139] Urban hip  Urban hip  Urban hip  Urban hip  Urban hip  Urban hip 
## [145] Urban hip  Urban hip  Urban hip  Urban hip  Urban hip  Urban hip 
## [151] Travelers  Travelers  Travelers  Travelers  Travelers  Travelers 
## [157] Travelers  Travelers  Travelers  Travelers  Travelers  Travelers 
## [163] Travelers  Travelers  Travelers  Travelers  Travelers  Travelers 
## [169] Travelers  Travelers  Travelers  Travelers  Travelers  Travelers 
## [175] Travelers  Travelers  Travelers  Travelers  Travelers  Travelers 
## [181] Travelers  Travelers  Travelers  Travelers  Travelers  Travelers 
## [187] Travelers  Travelers  Travelers  Travelers  Travelers  Travelers 
## [193] Travelers  Travelers  Travelers  Travelers  Travelers  Travelers 
## [199] Travelers  Travelers  Travelers  Travelers  Travelers  Travelers 
## [205] Travelers  Travelers  Travelers  Travelers  Travelers  Travelers 
## [211] Travelers  Travelers  Travelers  Travelers  Travelers  Travelers 
## [217] Travelers  Travelers  Travelers  Travelers  Travelers  Travelers 
## [223] Travelers  Travelers  Travelers  Travelers  Travelers  Travelers 
## [229] Travelers  Travelers  Moving up  Moving up  Moving up  Moving up 
## [235] Moving up  Moving up  Moving up  Moving up  Moving up  Moving up 
## [241] Moving up  Moving up  Moving up  Moving up  Moving up  Moving up 
## [247] Moving up  Moving up  Moving up  Moving up  Moving up  Moving up 
## [253] Moving up  Moving up  Moving up  Moving up  Moving up  Moving up 
## [259] Moving up  Moving up  Moving up  Moving up  Moving up  Moving up 
## [265] Moving up  Moving up  Moving up  Moving up  Moving up  Moving up 
## [271] Moving up  Moving up  Moving up  Moving up  Moving up  Moving up 
## [277] Moving up  Moving up  Moving up  Moving up  Moving up  Moving up 
## [283] Moving up  Moving up  Moving up  Moving up  Moving up  Moving up 
## [289] Moving up  Moving up  Moving up  Moving up  Moving up  Moving up 
## [295] Moving up  Moving up  Moving up  Moving up  Moving up  Moving up 
## Levels: Moving up Suburb mix Travelers Urban hip

#!!!!!! how it works?
car::some(seg.income.mean[seg.df$Segment, ],20)

##         Group.1        x
## 2.20 Suburb mix 55033.82
## 2.47 Suburb mix 55033.82
## 2.52 Suburb mix 55033.82
## 2.65 Suburb mix 55033.82
## 2.70 Suburb mix 55033.82
## 2.83 Suburb mix 55033.82
## 2.85 Suburb mix 55033.82
## 2.90 Suburb mix 55033.82
## 4.41  Urban hip 21681.93
## 3.6   Travelers 62213.94
## 3.20  Travelers 62213.94
## 3.22  Travelers 62213.94
## 3.33  Travelers 62213.94
## 3.34  Travelers 62213.94
## 3.51  Travelers 62213.94
## 3.57  Travelers 62213.94
## 3.72  Travelers 62213.94
## 1.32  Moving up 53090.97
## 1.37  Moving up 53090.97
## 1.59  Moving up 53090.97

#The result is a data frame in which each row of seg.income.mean occurs many times in the order requested.
car::some(seg.income.mean[seg.df$Segment, 2],20)

##  [1] 55033.82 55033.82 55033.82 55033.82 55033.82 21681.93 21681.93
##  [8] 62213.94 62213.94 62213.94 62213.94 62213.94 53090.97 53090.97
## [15] 53090.97 53090.97 53090.97 53090.97 53090.97 53090.97

# The same idea once more on a small example data frame
# (group is recycled to the length of value)
(test <- data.frame(group = c("A", "B"), value = 1:10))

##    group value
## 1      A     1
## 2      B     2
## 3      A     3
## 4      B     4
## 5      A     5
## 6      B     6
## 7      A     7
## 8      B     8
## 9      A     9
## 10     B    10

# Indexing with a factor works:
# each element selects the row at that level's position, so only the first
# two rows (value 1 and 2) are ever returned
test[factor(rep(c("A","B"),5)),]

##     group value
## 1       A     1
## 2       B     2
## 1.1     A     1
## 2.1     B     2
## 1.2     A     1
## 2.2     B     2
## 1.3     A     1
## 2.3     B     2
## 1.4     A     1
## 2.4     B     2

#integer also works
test[as.integer(factor(rep(c("A","B"),5))),]

##     group value
## 1       A     1
## 2       B     2
## 1.1     A     1
## 2.1     B     2
## 1.2     A     1
## 2.2     B     2
## 1.3     A     1
## 2.3     B     2
## 1.4     A     1
## 2.4     B     2

# A character index doesn't work: it is matched against the row names, and
# no row of test is named "A" or "B", so only NA rows come back
test[c("A","B"),]

##      group value
## NA    <NA>    NA
## NA.1  <NA>    NA

# Conclusion: only use this kind of indexing when you understand how it
# works, otherwise it can produce surprising results

# clear that variable
seg.df$segIncome <- NULL

#### formula version
#response variables on the left from explanatory variables on the right.
#take income by Segment within the data set seg.df, and apply mean to each group
aggregate(income ~ Segment, data=seg.df, mean)

##      Segment   income
## 1  Moving up 53090.97
## 2 Suburb mix 55033.82
## 3  Travelers 62213.94
## 4  Urban hip 21681.93

##########
# two-way data aggregation
#by list
aggregate(seg.df$income, list(seg.df$Segment, seg.df$ownHome), mean)

##      Group.1 Group.2        x
## 1  Moving up   ownNo 54497.68
## 2 Suburb mix   ownNo 54932.83
## 3  Travelers   ownNo 63188.42
## 4  Urban hip   ownNo 21337.59
## 5  Moving up  ownYes 50216.37
## 6 Suburb mix  ownYes 55143.21
## 7  Travelers  ownYes 61889.12
## 8  Urban hip  ownYes 23059.27

#by formula
aggregate(income ~ Segment + ownHome, data=seg.df, mean)

##      Segment ownHome   income
## 1  Moving up   ownNo 54497.68
## 2 Suburb mix   ownNo 54932.83
## 3  Travelers   ownNo 63188.42
## 4  Urban hip   ownNo 21337.59
## 5  Moving up  ownYes 50216.37
## 6 Suburb mix  ownYes 55143.21
## 7  Travelers  ownYes 61889.12
## 8  Urban hip  ownYes 23059.27

#Multiple variable formula
aggregate(income ~ Segment + ownHome + subscribe, data=seg.df, mean)

##       Segment ownHome subscribe   income
## 1   Moving up   ownNo     subNo 55402.89
## 2  Suburb mix   ownNo     subNo 54579.99
## 3   Travelers   ownNo     subNo 65852.54
## 4   Urban hip   ownNo     subNo 21604.16
## 5   Moving up  ownYes     subNo 49898.85
## 6  Suburb mix  ownYes     subNo 55354.86
## 7   Travelers  ownYes     subNo 61749.71
## 8   Urban hip  ownYes     subNo 23993.93
## 9   Moving up   ownNo    subYes 50675.70
## 10 Suburb mix   ownNo    subYes 63753.97
## 11  Travelers   ownNo    subYes 48091.75
## 12  Urban hip   ownNo    subYes 20271.33
## 13  Moving up  ownYes    subYes 51359.44
## 14 Suburb mix  ownYes    subYes 52815.13
## 15  Travelers  ownYes    subYes 62944.64
## 16  Urban hip  ownYes    subYes 19320.64

agg.data <- aggregate(income ~ Segment + ownHome, data=seg.df, mean)
agg.data[2, ]

##      Segment ownHome   income
## 2 Suburb mix   ownNo 54932.83

agg.data[2, 3]

## [1] 54932.83

# Count of factor level occurence by factor
#not bad
table(seg.df$Segment, ownHome = seg.df$ownHome)

##             ownHome
##              ownNo ownYes
##   Moving up     47     23
##   Suburb mix    52     48
##   Travelers     20     60
##   Urban hip     40     10

#better
with(seg.df, table(Segment, ownHome))

##             ownHome
## Segment      ownNo ownYes
##   Moving up     47     23
##   Suburb mix    52     48
##   Travelers     20     60
##   Urban hip     40     10

# multi-dimensional array: counts by Segment x ownHome x subscribe
temp <- with(seg.df, table(Segment, ownHome, subscribe))
temp

## , , subscribe = subNo
## 
##             ownHome
## Segment      ownNo ownYes
##   Moving up     38     18
##   Suburb mix    50     44
##   Travelers     17     53
##   Urban hip     32      8
## 
## , , subscribe = subYes
## 
##             ownHome
## Segment      ownNo ownYes
##   Moving up      9      5
##   Suburb mix     2      4
##   Travelers      3      7
##   Urban hip      8      2

temp[,,1]

##             ownHome
## Segment      ownNo ownYes
##   Moving up     38     18
##   Suburb mix    50     44
##   Travelers     17     53
##   Urban hip     32      8

temp[,,2]

##             ownHome
## Segment      ownNo ownYes
##   Moving up      9      5
##   Suburb mix     2      4
##   Travelers      3      7
##   Urban hip      8      2

with(seg.df, table(kids, Segment))

##     Segment
## kids Moving up Suburb mix Travelers Urban hip
##    0        13         11        80        17
##    1        17         36         0        17
##    2        18         22         0        11
##    3        13         19         0         4
##    4         5          7         0         1
##    5         3          3         0         0
##    6         0          2         0         0
##    7         1          0         0         0

# total of variables by factor, total kids by Segment
xtabs(kids ~ Segment, data=seg.df)

## Segment
##  Moving up Suburb mix  Travelers  Urban hip 
##        134        192          0         55

table(seg.df$kids, seg.df$Segment)

##    
##     Moving up Suburb mix Travelers Urban hip
##   0        13         11        80        17
##   1        17         36         0        17
##   2        18         22         0        11
##   3        13         19         0         4
##   4         5          7         0         1
##   5         3          3         0         0
##   6         0          2         0         0
##   7         1          0         0         0

aggregate(kids ~ Segment, data=seg.df, sum)

##      Segment kids
## 1  Moving up  134
## 2 Suburb mix  192
## 3  Travelers    0
## 4  Urban hip   55

# Another option is to multiply the frequency table by the number of kids
# each row stands for and add it up.
# The row labels of seg.tab are the observed kid counts, so the multipliers
# are taken from them instead of a hard-coded 0:7, which would be recycled
# against the wrong rows if a count were missing or the maximum differed.
seg.tab <- with(seg.df, table(kids, Segment))
kid.counts <- as.numeric(rownames(seg.tab))
apply(seg.tab*kid.counts, 2, sum)

##  Moving up Suburb mix  Travelers  Urban hip 
##        134        192          0         55

colSums(seg.tab*kid.counts)

##  Moving up Suburb mix  Travelers  Urban hip 
##        134        192          0         55

#### visualize counts by group

# histogram by 1 factor
lattice::histogram(~subscribe | Segment, data=seg.df)

with(seg.df, table(Segment, subscribe)/as.integer(table(Segment)))

##             subscribe
## Segment      subNo subYes
##   Moving up  0.800  0.200
##   Suburb mix 0.940  0.060
##   Travelers  0.875  0.125
##   Urban hip  0.800  0.200

#prop.table can do this easier



# counts instead of proportions, and some visual options
lattice::histogram(~subscribe | Segment, data=seg.df, type="count", 
          layout=c(4,1), col=c("burlywood", "darkolivegreen"))

with(seg.df, table(Segment, subscribe))

##             subscribe
## Segment      subNo subYes
##   Moving up     56     14
##   Suburb mix    94      6
##   Travelers     70     10
##   Urban hip     40     10

# histogram by 2 factors
lattice::histogram(~subscribe | Segment + ownHome, data=seg.df)

#?how to make this data

# use prop.table to get just positive proportion
prop.table(table(seg.df$subscribe, seg.df$Segment), margin=2)

##         
##          Moving up Suburb mix Travelers Urban hip
##   subNo      0.800      0.940     0.875     0.800
##   subYes     0.200      0.060     0.125     0.200

prop.table(with(seg.df, table(subscribe, Segment)), margin=2)

##          Segment
## subscribe Moving up Suburb mix Travelers Urban hip
##    subNo      0.800      0.940     0.875     0.800
##    subYes     0.200      0.060     0.125     0.200

lattice::barchart(prop.table(table(seg.df$subscribe, seg.df$Segment), margin=2)[2, ], 
          xlab="Subscriber proportion by Segment", col="darkolivegreen")

# with generic (base graphics) function
# par() returns the previous values of the parameters it sets; keep them in
# old.par so the original layout and margins can be restored after the plot.
# old.par was previously used without ever being defined.
old.par <- par(mfrow=c(1,1), mar=c(3, 7, 3, 3))
barplot(prop.table(table(seg.df$subscribe, seg.df$Segment), margin=2)[2, ],
  horiz=TRUE,
  col="darkolivegreen",
  las=2
)

par(old.par)


#### visualize continuous data by group

## bar chart for continuous variable, the "spreadsheet" way to graph it
# aggregate our data
seg.mean <- aggregate(income ~ Segment, data=seg.df, mean)
lattice::barchart(income~Segment, data=seg.mean, col="grey")

seg.income.agg <- aggregate(income ~ Segment + ownHome, data=seg.df, mean)
# then plot it
lattice::barchart(income ~ Segment, data=seg.income.agg, 
         groups=ownHome, auto.key=TRUE,
         par.settings = lattice::simpleTheme(col=c("gray95", "gray50"))
)

## better = boxplot for continuous variable

# base graphics way to do this

# draw the boxplot without its default y axis, then add an axis labeled in
# thousands ("0k", "20k", ...)
temp <- boxplot(income ~ Segment, data=seg.df, yaxt="n", ylab="Income ($k)")
ax.seq <- seq(from=0, to=120000, by=20000)
axis(side=2, at=ax.seq, labels=paste0(ax.seq/1000, "k"), las=1)

# Quartile table taken from the statistics boxplot() returns invisibly:
# one column per segment, five rows per column.
# NOTE(review): per ?boxplot the first and last rows are the whisker ends,
# which exclude points drawn as outliers -- presumably why they are labeled
# "not min" / "not max"; confirm.
iqr_stats <- temp$stats
dimnames(iqr_stats) <- list(
  c("not min??", "第1四分位数","中央値","第3四分位数", "not max??"),
  temp$names
)
print(iqr_stats)

##             Moving up Suburb mix Travelers Urban hip
## not min??    29771.91   28270.15  15346.68  11985.25
## 第1四分位数  46540.88   48057.69  48549.67  17784.57
## 中央値       52564.55   54819.01  61014.30  22141.01
## 第3四分位数  58933.34   61339.16  77359.70  24533.93
## not max??    73797.50   81041.99 114278.26  33909.50

# lattice gives more options, especially for multiway breakouts ("conditioning")
lattice::bwplot(Segment ~ income, data=seg.df, horizontal=TRUE, xlab = "Income")

# add conditioning variable
lattice::bwplot(Segment ~ income | ownHome, data=seg.df, horizontal=TRUE, xlab="Income")