Exploring Data


Visually checking relationships between two variables



Line plots

# Line plot of y = x^2 + 0.2x evaluated at 100 uniform random x values.
library(ggplot2)

x <- runif(100)
y <- x^2 + 0.2 * x

# data.frame(x, y) names its columns after the two vectors.
ggplot(data.frame(x, y), aes(x, y)) +
  geom_line()



Scatter plots

# Load the tab-separated customer table; the first row holds the column names.
customer_data <- read.table("custdata.tsv", header = TRUE, sep = "\t")

# Keep only rows with 0 < age < 100 and income > 0.
# subset() evaluates the condition inside the data frame, so columns are
# referenced by bare name; rows where the condition is NA are dropped.
customer_data2 <- subset(
  customer_data,
  age > 0 & age < 100 & income > 0
)

# Correlation between age and income in the filtered data.
with(customer_data2, cor(age, income))
## [1] -0.02240845
# Scatter plot of income against age, with the y axis limited to 0-200000.
ggplot(customer_data2, aes(age, income)) +
  geom_point() +
  ylim(0, 200000)
## Warning: Removed 32 rows containing missing values (geom_point).

Notice the stat_smooth(method="lm") layer here: it overlays a fitted linear regression line on the scatter plot.

# Income against age again, this time with a linear-model fit on top.
ggplot(customer_data2, aes(age, income)) +
  geom_point() +
  stat_smooth(method = "lm") +
  ylim(0, 200000)
## Warning: Removed 32 rows containing missing values (stat_smooth).
## Warning: Removed 32 rows containing missing values (geom_point).

Notice that we’re using geom_smooth() with its default method here, which fits a smoothed (loess) curve instead of a straight line.

# Income against age with geom_smooth() left to choose its smoothing method.
ggplot(customer_data2, aes(age, income)) +
  geom_point() +
  geom_smooth() +
  ylim(0, 200000)
## geom_smooth: method="auto" and size of largest group is <1000, so using loess. Use 'method = x' to change the smoothing method.
## Warning: Removed 32 rows containing missing values (stat_smooth).
## Warning: Removed 32 rows containing missing values (geom_point).

# Plot health.ins (coerced with as.numeric()) against age. The points are
# jittered by 0.05 in both directions, and a smoothing curve is overlaid.
# The jitter arguments are spelled out in full rather than relying on
# partial argument matching (w / h).
ggplot(customer_data2, aes(x = age, y = as.numeric(health.ins))) +
  geom_point(position = position_jitter(width = 0.05, height = 0.05)) +
  geom_smooth()
## geom_smooth: method="auto" and size of largest group is <1000, so using loess. Use 'method = x' to change the smoothing method.

Hexbin plots

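The scatter plots above are still legible because there aren’t too many points to plot, but what if there were many more? That’s where hexbin plots come in: they group the points into hexagonal bins and shade each bin by the number of points it contains.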

library(hexbin)

# Bin the points into hexagons with a bin width of 5 in age and 10000 in
# income, then overlay a white smoothing curve with its confidence band
# turned off. FALSE is spelled out because F is an ordinary variable that
# can be reassigned.
ggplot(customer_data2, aes(x = age, y = income)) +
  geom_hex(binwidth = c(5, 10000)) +
  geom_smooth(color = "white", se = FALSE) +
  ylim(0, 200000)
## Warning: Removed 32 rows containing missing values (stat_hexbin).
## geom_smooth: method="auto" and size of largest group is <1000, so using loess. Use 'method = x' to change the smoothing method.
## Warning: Removed 32 rows containing missing values (stat_smooth).

Bar charts for two categorical variables

Stacked

# One bar per marital status, split by health.ins (bars stack by default).
ggplot(customer_data, aes(x = marital.stat, fill = health.ins)) +
  geom_bar()

Side-by-side

# Same counts, with the health.ins groups placed next to each other.
ggplot(customer_data, aes(x = marital.stat, fill = health.ins)) +
  geom_bar(position = "dodge")

Filled bar chart

# Each bar is scaled to full height, so the fill shows proportions.
ggplot(customer_data, aes(x = marital.stat, fill = health.ins)) +
  geom_bar(position = "fill")

Adding a rug

# Filled bar chart with a "rug" of jittered points drawn just below the bars
# at y = -0.05, one point per row of the data. The jitter height is passed
# by its full argument name rather than the partially matched `h`.
ggplot(customer_data, aes(x = marital.stat)) +
  geom_bar(aes(fill = health.ins), position = "fill") +
  geom_point(
    aes(y = -0.05),
    size = 0.75,
    alpha = 0.3,
    position = position_jitter(height = 0.01)
  )

# Counts per housing type, with marital-status groups side by side.
# The x-axis labels are rotated 45 degrees and right-justified.
ggplot(customer_data2, aes(x = housing.type, fill = marital.stat)) +
  geom_bar(position = "dodge") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

Faceted bar chart

# One panel per housing type, each with its own y scale, showing counts by
# marital status. The x-axis labels are rotated 45 degrees.
ggplot(customer_data2, aes(x = marital.stat)) +
  geom_bar(position = "dodge", fill = "darkgray") +
  facet_wrap(~housing.type, scales = "free_y") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))