I love barplots! Here’s a simple function that allows me to do some quick 2-way exploration of my data when I have categorical variables.
Let’s give ourselves some toy data to play with:
# Levels of the three categorical variables in the toy data set.
gender <- c("Male", "Female")
age_band <- c("[16-24]", "[25-49]", "50+")
buying_ratio <- c("1.Low", "2.Medium", "3.High")

# Number of simulated rows. Every sample() call below draws exactly `n`
# values, so changing this one number resizes the whole data set.
n <- 100

# Each variable is drawn separately, with replacement and equal probability
# for every level. No seed is set, so each run produces different rows.
gender_v <- sample(gender, n, replace = TRUE)
age_band_v <- sample(age_band, n, replace = TRUE)
# NOTE: this overwrites the level vector defined above with the sampled values.
buying_ratio <- sample(buying_ratio, n, replace = TRUE)

df <- data.frame(
  gender = gender_v,
  age_band = age_band_v,
  buying_ratio = buying_ratio
)
df
## gender age_band buying_ratio
## 1 Female [16-24] 3.High
## 2 Male 50+ 1.Low
## 3 Female [25-49] 1.Low
## 4 Male [25-49] 1.Low
## 5 Male [16-24] 1.Low
## 6 Male [16-24] 1.Low
## 7 Male 50+ 1.Low
## 8 Male [25-49] 2.Medium
## 9 Male [16-24] 2.Medium
## 10 Female [25-49] 3.High
## 11 Male 50+ 1.Low
## 12 Male [25-49] 2.Medium
## 13 Female [25-49] 1.Low
## 14 Female [25-49] 1.Low
## 15 Male [25-49] 1.Low
## 16 Female [25-49] 3.High
## 17 Male [16-24] 2.Medium
## 18 Male 50+ 3.High
## 19 Male [25-49] 2.Medium
## 20 Female [16-24] 3.High
## 21 Female [16-24] 1.Low
## 22 Female [25-49] 2.Medium
## 23 Female [16-24] 3.High
## 24 Female 50+ 3.High
## 25 Female [25-49] 2.Medium
## 26 Male [25-49] 3.High
## 27 Male [16-24] 1.Low
## 28 Male 50+ 1.Low
## 29 Female 50+ 1.Low
## 30 Male [16-24] 1.Low
## 31 Male [25-49] 2.Medium
## 32 Female [25-49] 3.High
## 33 Female 50+ 2.Medium
## 34 Female [25-49] 1.Low
## 35 Male [25-49] 1.Low
## 36 Male [16-24] 2.Medium
## 37 Female [25-49] 3.High
## 38 Female [16-24] 1.Low
## 39 Male [25-49] 1.Low
## 40 Male [25-49] 2.Medium
## 41 Male [25-49] 2.Medium
## 42 Male [25-49] 1.Low
## 43 Male 50+ 3.High
## 44 Male 50+ 3.High
## 45 Male 50+ 3.High
## 46 Male 50+ 1.Low
## 47 Female [16-24] 1.Low
## 48 Male [16-24] 3.High
## 49 Male 50+ 2.Medium
## 50 Female [25-49] 3.High
## 51 Male 50+ 2.Medium
## 52 Female [16-24] 3.High
## 53 Female 50+ 3.High
## 54 Male [25-49] 3.High
## 55 Male [25-49] 2.Medium
## 56 Male 50+ 1.Low
## 57 Male [16-24] 3.High
## 58 Male 50+ 2.Medium
## 59 Female [25-49] 2.Medium
## 60 Female 50+ 2.Medium
## 61 Male [25-49] 3.High
## 62 Female [25-49] 1.Low
## 63 Female [25-49] 2.Medium
## 64 Male [25-49] 3.High
## 65 Male [16-24] 3.High
## 66 Male 50+ 2.Medium
## 67 Male [25-49] 3.High
## 68 Male 50+ 3.High
## 69 Female [25-49] 3.High
## 70 Male [25-49] 2.Medium
## 71 Female [16-24] 3.High
## 72 Male [16-24] 3.High
## 73 Female 50+ 1.Low
## 74 Male [25-49] 3.High
## 75 Female [16-24] 2.Medium
## 76 Female 50+ 2.Medium
## 77 Male [25-49] 1.Low
## 78 Female [25-49] 2.Medium
## 79 Male [25-49] 3.High
## 80 Female [25-49] 2.Medium
## 81 Male 50+ 1.Low
## 82 Female 50+ 1.Low
## 83 Female [16-24] 2.Medium
## 84 Female [25-49] 3.High
## 85 Male [16-24] 1.Low
## 86 Male 50+ 2.Medium
## 87 Female [16-24] 1.Low
## 88 Male [16-24] 1.Low
## 89 Female [16-24] 2.Medium
## 90 Female 50+ 3.High
## 91 Male [25-49] 3.High
## 92 Female [25-49] 1.Low
## 93 Male [25-49] 3.High
## 94 Female [16-24] 3.High
## 95 Female [25-49] 3.High
## 96 Female 50+ 3.High
## 97 Female [16-24] 1.Low
## 98 Male [25-49] 1.Low
## 99 Male 50+ 2.Medium
## 100 Female [16-24] 2.Medium
Next, a function that draws a stacked barplot of the conditional distribution of one variable given another:
#' Stacked barplot of one categorical variable conditional on another.
#'
#' Draws one bar per level of `varname`. Each bar is split into stacked
#' segments giving the counts of the levels of `varname2` within that bar.
#' Bar widths are proportional to the number of rows in each `varname` level
#' (the count is also appended to the axis label), and every segment is
#' annotated with its share of the bar, rounded to a whole percentage.
#'
#' @param varname Name of the column whose levels define the bars.
#' @param varname2 Name of the column whose levels define the stacked
#'   segments; levels are stacked in sorted order of their labels.
#' @param col Fill colours for the segments, passed on to `barplot()`.
#' @param data Data frame holding both columns. Defaults to the global `df`,
#'   which is the object the function has always read from.
#' @return The bar midpoints as returned by `barplot()`, invisibly.
plot_var <- function(varname, varname2, col, data = df) {
  stopifnot(is.data.frame(data))
  for (v in list(varname, varname2)) {
    if (is.character(v) && !all(v %in% names(data))) {
      stop("column not found in data: ", paste(v, collapse = ", "),
           call. = FALSE)
    }
  }

  # Rows = levels of varname2 (segments), columns = levels of varname (bars).
  counts <- t(table(data[[varname]], data[[varname2]]))
  if (length(counts) == 0L) {
    stop("nothing to plot: the cross-tabulation is empty", call. = FALSE)
  }
  # drop = FALSE keeps the two-dimensional shape when a variable has only one
  # level; without it the table collapses to a vector and the code below fails.
  counts <- counts[order(rownames(counts)), , drop = FALSE]

  bar_totals <- colSums(counts)

  # Top edge of every segment = running total of the counts within each bar.
  # Assigning into `[]` keeps the shape of `counts` even for a single row.
  seg_tops <- counts
  seg_tops[] <- apply(counts, 2, cumsum)
  # Labels sit at the vertical centre of their segment.
  label_y <- as.vector(seg_tops - counts / 2)

  # Share of each segment within its bar, as a rounded percentage.
  shares <- as.vector(round(prop.table(counts, margin = 2), 2) * 100)
  seg_labels <- paste(shares, "%")
  # A bar with zero rows (unused factor level) has no meaningful share.
  seg_labels[!is.finite(shares)] <- ""

  bar_mid <- barplot(
    counts,
    main = paste(varname2, "by", varname),
    names.arg = paste(colnames(counts), "(", bar_totals, ")"),
    beside = FALSE,
    col = col,
    # Use the row names of the table that is actually plotted so the legend
    # always matches the stacking order.
    legend.text = rownames(counts),
    args.legend = list(x = "topleft", cex = 0.6, inset = c(0, -0.05)),
    width = bar_totals
  )

  # `label_y` and `seg_labels` are in column-major order (all segments of the
  # first bar, then the second, ...), so repeat each midpoint once per segment.
  text(rep(bar_mid, each = nrow(counts)), label_y, seg_labels)

  invisible(bar_mid)
}
We can call the function like so:
library(RColorBrewer)
# One bar per gender, each split by buying_ratio, using a 3-colour
# "Oranges" palette from RColorBrewer.
plot_var(
  varname = "gender",
  varname2 = "buying_ratio",
  col = brewer.pal(n = 3, name = "Oranges")
)
And the other way around, with one bar per buying ratio split by gender:
# One bar per buying_ratio level, each split by gender, with two named colours.
plot_var(
  varname = "buying_ratio",
  varname2 = "gender",
  col = c("indianred1", "lightblue2")
)