This is a data visualization exercise based on the West Roxbury House Prices dataset. The purpose of this exercise is to show how to create various graphs, such as scatter plots, box plots, histograms, and heat maps, using R.

# Load the West Roxbury housing data shipped with the mlba package.
df <- mlba::WestRoxbury
# Optional clean-up of the column labels: relabel the third column as
# "LOT.SIZE", then convert every label to lowercase.
colnames(df)[3] <- "LOT.SIZE"
colnames(df) <- tolower(colnames(df))

Box plots for quantitative variables

# Four box plots in a 2 x 2 grid: total value overall, total value split by
# remodel category, lot size, and living area.
opar <- par(no.readonly = TRUE)  # remember the current graphics settings

# Shrink the margins and lay the panels out in a 2 x 2 matrix filled by rows.
par(mar = c(2, 4, 2, 2), mfrow = c(2, 2))

boxplot(
  df$total.value,
  main = "Distribution of Total Value", ylab = "in 000s",
  col = "cyan", border = "blue"
)

# Grouping factor for the per-remodel comparison.
f <- factor(df$remodel)
boxplot(
  df$total.value ~ f, data = df,
  main = "Total Value vs. Remodel Types", ylab = "in 000s",
  col = c("red", "green", "yellow"), border = "blue"
)

boxplot(
  df$lot.size,
  main = "Distribution of Lot Size", ylab = "Lot Size",
  col = "orange", border = "blue"
)
boxplot(
  df$living.area,
  main = "Distribution of Living Area", ylab = "Living Area",
  col = "cyan", border = "blue"
)

# Put the graphics settings back the way they were.
par(opar)

Histograms

# Four histograms in a 2 x 2 grid, one per quantitative variable.
opar <- par(no.readonly = TRUE)  # remember the current graphics settings
par(mar = c(2, 4, 2, 2), mfrow = c(2, 2))

# One entry per panel: column to plot, number of breaks, bar colour, title.
# The x-axis label of each panel is the column name itself.
hist_specs <- list(
  list(column = "total.value", breaks = 20, fill = "red",
       title = "Histogram of Total Value"),
  list(column = "lot.size", breaks = 40, fill = "purple",
       title = "Histogram of Lot Size"),
  list(column = "gross.area", breaks = 20, fill = "green",
       title = "Histogram of Gross Area"),
  list(column = "living.area", breaks = 20, fill = "yellow",
       title = "Histogram of Living Area")
)

for (spec in hist_specs) {
  hist(
    df[[spec$column]],
    breaks = spec$breaks,
    col = spec$fill,
    main = spec$title,
    xlab = spec$column,
    ylab = "count"
  )
}

# Put the graphics settings back the way they were.
par(opar)

Scatter plots exploring associations

library(ggplot2)
library(cowplot)
# Scatter plots of total value against lot size (A, B), living area (C) and
# number of rooms (D), each with a fitted linear trend line. Panels B-D colour
# the points by number of floors; only panel B shows the colour legend.

# Shared y axis for all four panels: fixed breaks with dollar-sign labels.
value_axis <- scale_y_continuous(
  breaks = c(250, 500, 750, 1000, 1250),
  labels = c("$250", "$500", "$750", "$1000", "$1250")
)

# Shared linear fit with its standard-error band. `TRUE` is spelled out
# because `T` is an ordinary variable that can be reassigned.
trend_line <- geom_smooth(method = "lm", formula = y ~ x, se = TRUE)

p1 <- ggplot(df, aes(x = lot.size, y = total.value)) +
  geom_point(size = 2) +
  trend_line +
  value_axis

p2 <- ggplot(df, aes(x = lot.size, y = total.value)) +
  geom_point(aes(color = factor(floors)), size = 2) +
  trend_line +
  theme(legend.position = "bottom", legend.box = "horizontal") +
  value_axis

p3 <- ggplot(df, aes(x = living.area, y = total.value)) +
  geom_point(aes(color = factor(floors)), size = 2) +
  trend_line +
  theme(legend.position = "none") +
  value_axis

p4 <- ggplot(df, aes(x = rooms, y = total.value)) +
  geom_point(aes(color = factor(floors)), size = 2) +
  trend_line +
  theme(legend.position = "none") +
  value_axis

# Arrange the four panels in a 2 x 2 grid labelled A-D and display it.
p <- cowplot::plot_grid(p1, p2, p3, p4, nrow = 2, labels = LETTERS[1:4])
p

Bar charts

library(ggplot2)
library(cowplot)
# Bar charts of the mean total value per category of remodel, floors, rooms
# and bedrooms. Each bar is labelled with its rounded mean.
#
# The bar fill is mapped to the grouping variable itself (as a factor) rather
# than to a hard-coded LETTERS[1:k] vector, so the charts keep working when
# the number of distinct categories in the data changes. The legend is hidden
# in every panel.

# Mean total value by remodel category.
Avg.value_1 <- aggregate(df$total.value, by = list(type = df$remodel), mean)
names(Avg.value_1) <- c("remodel", "Avg.Value")
p1 <- ggplot(data = Avg.value_1, mapping = aes(x = remodel, y = Avg.Value)) +
  geom_col(aes(fill = factor(remodel))) +
  geom_text(aes(label = round(Avg.Value)), size = 3) +
  scale_y_continuous(
    breaks = c(0, 100, 200, 300, 400),
    labels = c("$0", "$100", "$200", "$300", "$400")
  ) +
  theme(legend.position = "none")

# Mean total value by number of floors.
Avg.value_2 <- aggregate(df$total.value, by = list(type = df$floors), mean)
names(Avg.value_2) <- c("floors", "Avg.Value")
p2 <- ggplot(data = Avg.value_2, mapping = aes(x = floors, y = Avg.Value)) +
  geom_col(aes(fill = factor(floors))) +
  geom_text(aes(label = round(Avg.Value)), size = 3) +
  scale_y_continuous(
    breaks = c(0, 200, 400),
    labels = c("$0", "$200", "$400")
  ) +
  theme(legend.position = "none")

# Mean total value by number of rooms.
Avg.value_3 <- aggregate(df$total.value, by = list(type = df$rooms), mean)
names(Avg.value_3) <- c("rooms", "Avg.Value")
p3 <- ggplot(data = Avg.value_3, mapping = aes(x = rooms, y = Avg.Value)) +
  geom_col(aes(fill = factor(rooms))) +
  geom_text(aes(label = round(Avg.Value)), size = 3) +
  scale_y_continuous(
    breaks = c(0, 200, 400, 600),
    labels = c("$0", "$200", "$400", "$600")
  ) +
  theme(legend.position = "none")

# Mean total value by number of bedrooms.
Avg.value_4 <- aggregate(df$total.value, by = list(type = df$bedrooms), mean)
names(Avg.value_4) <- c("bedrooms", "Avg.Value")
p4 <- ggplot(data = Avg.value_4, mapping = aes(x = bedrooms, y = Avg.Value)) +
  geom_col(aes(fill = factor(bedrooms))) +
  geom_text(aes(label = round(Avg.Value)), size = 3) +
  scale_y_continuous(
    breaks = c(0, 250, 500, 750),
    labels = c("$0", "$250", "$500", "$750")
  ) +
  theme(legend.position = "none")

# Arrange the four panels in a 2 x 2 grid labelled A-D and display it.
p <- cowplot::plot_grid(p1, p2, p3, p4, nrow = 2, labels = LETTERS[1:4])
p

Heat map showing pairwise correlations

library(gplots)
## 
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
## 
##     lowess

# Heat map of pairwise correlations. No attach()/detach() is needed: every
# column is reached through `df` directly.
# Columns are selected by position (1, 3, 5-9).
# NOTE(review): positions depend on the column order of the loaded data --
# confirm they are the intended variables.
df2 <- df[, c(1, 3, 5:9)]

# Compute the correlation matrix once and reuse it for both the cell colours
# and the rounded cell annotations.
cor_mat <- cor(df2)
heatmap.2(cor_mat, Rowv = FALSE, Colv = FALSE, dendrogram = "none",
          cellnote = round(cor_mat, 2),
          notecol = "black", key = FALSE, trace = "none", margins = c(10, 10),
          main = "Correlation Values")

Parallel Coordinate Plots

library(GGally)
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
library(viridis)
## Loading required package: viridisLite

# Parallel coordinate plots of the homes with at most four bedrooms, drawn
# under four different ggparcoord() scaling methods. Lines are grouped (and
# coloured) by column 9, which is converted to a factor below.
data <- subset(df, bedrooms <= 4)
data$bedrooms <- factor(data$bedrooms)

# Build one panel. Only the scaling method, the title and the legend position
# differ between panels; everything else is shared.
make_parcoord <- function(plot_data, scale_method, plot_title,
                          legend_position = "none") {
  ggparcoord(plot_data, scale = scale_method, showPoints = TRUE,
             title = plot_title, columns = c(6, 1, 5, 3), groupColumn = 9,
             alphaLines = 0.1) +
    scale_color_viridis(discrete = TRUE) +
    theme(plot.title = element_text(size = 10),
          legend.position = legend_position) +
    xlab("")
}

# Only the first panel carries the legend.
p1 <- make_parcoord(data, "globalminmax", "No scaling",
                    legend_position = "bottom")
p2 <- make_parcoord(data, "uniminmax", "Standardize to Min=0 and Max=1")
p3 <- make_parcoord(data, "std",
                    "Normalize univariately (subtract mean & divide by sd)")
p4 <- make_parcoord(data, "center", "Standardize and center variable")

# Arrange the four panels in a 2 x 2 grid with automatic A-D labels.
p <- cowplot::plot_grid(p1, p2, p3, p4, nrow = 2, labels = "AUTO")
p

Mosaic Plot

library(vcd)
## Loading required package: grid

# Mosaic plot of remodel category against number of floors. No
# attach()/detach() is needed: xtabs() receives `df` as its data argument and
# mosaic() works from the resulting contingency table.
table <- xtabs(~ floors + remodel, df)
mosaic(~ remodel + floors, table, shade = TRUE, legend = TRUE, color = TRUE)

## Cleveland Dot Plot using Cereals dataset

# Cleveland dot plot: one point per cereal, with cereals ordered along the
# y axis by their calorie count.
CR <- mlba::Cereals

# Plot-size options read by notebook front ends.
# NOTE(review): presumably for the Jupyter R kernel -- confirm.
options(repr.plot.width = 1, repr.plot.height = 40)

# No attach()/detach() is needed: ggplot() resolves `name` and `calories`
# inside the `CR` data frame it is given.
ggplot(CR, aes(y = reorder(name, calories), x = calories)) +
  geom_point(shape = 16, size = 1) +
  xlab("calories") +
  ylab("Name of cereal") +
  # Small bold axis text so the many cereal names fit on the axis.
  theme(axis.text = element_text(size = 4.5, face = "bold"))