# Data Visualization: base graphics vs ggplot2

#Data Visualization basic graphics vs ggplot2

library(ggplot2)

#-------------------------------------------------------------------------------------
#Scatter plot
#-------------------------------------------------------------------------------------


# BOD is a built-in data frame with 6 rows and 2 columns giving the
# biochemical oxygen demand versus time in an evaluation of water quality.
print(BOD)

##   Time demand
## 1    1    8.3
## 2    2   10.3
## 3    3   19.0
## 4    4   16.0
## 5    5   15.6
## 6    7   19.8

str(BOD)

## 'data.frame':    6 obs. of  2 variables:
##  $ Time  : num  1 2 3 4 5 7
##  $ demand: num  8.3 10.3 19 16 15.6 19.8
##  - attr(*, "reference")= chr "A1.4, p. 270"

# A two-column data frame is drawn as column 1 (Time) against column 2 (demand).
plot(BOD)

# Explicit vectors: here demand goes on the x axis and Time on the y axis.
plot(x = BOD$demand, y = BOD$Time)

# ann = FALSE suppresses the default annotations so title() can supply them.
plot(BOD, ann = FALSE)
title(
  main = "Biochemical Oxygen Demand",
  sub = "Water Quality Test",
  xlab = "Time in Days",
  ylab = "Oxygen Demand"
)
# The same annotations can be passed directly to plot().
plot(
  BOD,
  main = "Biochemical Oxygen Demand",
  sub = "Water Quality Test",
  xlab = "Time in Days",
  ylab = "Oxygen Demand"
)

# Storing a base plot in a variable:
# plot() draws as a side effect and returns NULL, so `pz <- plot(BOD)` would
# store nothing. Draw first, then snapshot the current device with
# recordPlot(); replayPlot(pz) redraws the stored plot later.
# On file devices such as pdf() the display list is off by default; call
# dev.control(displaylist = "enable") before plotting if a replayable
# snapshot is needed there.
plot(BOD)
pz <- recordPlot()

# qplot() builds a quick plot from bare vectors; quickplot() is an alias.
# NOTE: qplot() is deprecated in recent ggplot2 releases (3.4.0 onwards).
qplot(x = BOD$Time, y = BOD$demand, geom = "point")

quickplot(x = BOD$Time, y = BOD$demand, geom = "point")

# With data = BOD the columns can be named without the BOD$ prefix.
qplot(x = Time, y = demand, data = BOD, geom = "point")

# Title and axis labels are ordinary arguments.
qplot(
  x = Time,
  y = demand,
  data = BOD,
  geom = "point",
  main = "Biochemical Oxygen Demand",
  xlab = "Time in Days",
  ylab = "Oxygen Demand"
)

# Step 1 - data layer only: ggplot() receives the data frame, nothing is drawn.
ggplot(data = BOD)

# Step 2 - add the coordinate aesthetics (which column maps to x and y).
ggplot(data = BOD) + aes(x = Time, y = demand)

# Step 3 - add a geometry so that markers appear.
ggplot(data = BOD) + aes(x = Time, y = demand) + geom_point()

# A partial plot can be saved in a variable and extended with `+`.
p1 <- ggplot(data = BOD) + aes(x = Time, y = demand)
p1 + geom_point()

# Swapping the geometry is a one-word change, which keeps the code readable.
p1 + geom_area()

# Convention 1 (recommended): aesthetics declared inside ggplot().
ggplot(data = BOD, mapping = aes(x = Time, y = demand)) + geom_point()

# Convention 2: aesthetics declared inside the geom.
ggplot(data = BOD) + geom_point(mapping = aes(x = Time, y = demand))

# A stored ggplot object is drawn when it is printed.
p2 <- ggplot(data = BOD, mapping = aes(x = Time, y = demand)) + geom_point()
print(p2)

# Annotations via ggtitle() / xlab() / ylab() ...
p2 +
  ggtitle(label = "Biochemical Oxygen Demand", subtitle = "Water Quality Test") +
  xlab("Time in Days") +
  ylab("Oxygen Demand")

# ... or all at once via labs(), which also offers caption and tag.
p2 +
  labs(
    title = "Biochemical Oxygen Demand",
    subtitle = "Water Quality Test",
    x = "Time in Days",
    y = "Oxygen Demand",
    caption = "Data Source:caption",
    tag = "tag: subplot 1"
  )

#-------------------------------------------------------------------------------------
#Line plot ie scatterplot with line
#-------------------------------------------------------------------------------------


# type = "l" joins the points with a line instead of drawing markers.
plot(BOD, type = "l")
title(
  main = "Biochemical Oxygen Demand",
  sub = "Water Quality Test",
  xlab = "Time in Days",
  ylab = "Oxygen Demand"
)

# Same line plot with the annotations passed directly to plot().
plot(
  BOD,
  type = "l",
  main = "Biochemical Oxygen Demand",
  sub = "Water Quality Test",
  xlab = "Time in Days",
  ylab = "Oxygen Demand"
)

# qplot(): line geometry from bare vectors, then from data-frame columns.
qplot(x = BOD$Time, y = BOD$demand, geom = "line")

qplot(x = Time, y = demand, data = BOD, geom = "line")

qplot(
  x = Time,
  y = demand,
  data = BOD,
  geom = "line",
  main = "Biochemical Oxygen Demand",
  xlab = "Time in Days",
  ylab = "Oxygen Demand"
)

# ggplot(): same data and aesthetics, drawn with geom_line().
ggplot(data = BOD, mapping = aes(x = Time, y = demand)) + geom_line()

# Keep the line plot in p1 and decorate it in two different ways.
p1 <- ggplot(data = BOD, mapping = aes(x = Time, y = demand)) + geom_line()
p1 +
  ggtitle(label = "Biochemical Oxygen Demand", subtitle = "Water Quality Test") +
  xlab("Time in Days") +
  ylab("Oxygen Demand")

p1 +
  labs(
    title = "Biochemical Oxygen Demand",
    subtitle = "Water Quality Test",
    x = "Time in Days",
    y = "Oxygen Demand",
    caption = "Data Source:caption",
    tag = "tag: subplot 1"
  )

#-------------------------------------------------------------------------------------
#Barplot (#Column or Vertical barchart)
#-------------------------------------------------------------------------------------

# plot() on a factor draws a bar chart of the level counts.
plot(iris$Species)
title(
  main = "Barplot showing count of samples",
  sub = "50 samples each of three iris plants",
  xlab = "Category of iris plant",
  ylab = "Number of samples taken"
)

# barplot() wants the counts themselves, so tabulate the factor first.
barplot(table(iris$Species))
title(
  main = "Barplot showing count of samples",
  sub = "50 samples each of three iris plants",
  xlab = "Category of iris plant",
  ylab = "Number of samples taken"
)

# Numeric vector as bar heights; names.arg supplies the label under each bar.
barplot(women$height, names.arg = women$weight)
title(
  main = "Typical heights of US women between 30-39 years",
  sub = "heights relate to 15 typical women's weight",
  xlab = "weight ",
  ylab = "Height"
)

# Build our own dataset: internet browser users in 2018 (in million).
browser <- c("Chrome", "Edge", "Firefox", "IE", "Opera", "Safari", "Others")
users <- c(2502.4, 150.78, 395.83, 238.05, 86.49, 387.65, 134.8)
ib <- data.frame(browser, users)
ib

##   browser   users
## 1  Chrome 2502.40
## 2    Edge  150.78
## 3 Firefox  395.83
## 4      IE  238.05
## 5   Opera   86.49
## 6  Safari  387.65
## 7  Others  134.80

# Plain bar chart: heights only, no labels.
barplot(ib$users)

# Improved chart: title, axis labels, a name under each bar and colours.
barplot(
  height = ib$users,
  main = "2018 Internet Browser Users (in million)",
  xlab = "Internet Browser",
  ylab = "Users",
  names.arg = ib$browser,
  border = "dark blue",
  col = "orange"
)

# Ascending order.
# Reorder whole rows so each height stays paired with its browser name;
# sorting only the heights would leave the labels in their original order
# and attach the wrong name to every bar.
ib_asc <- ib[order(ib$users), ]
barplot(
  height = ib_asc$users,
  main = "2018 Internet Browser Users (in million)",
  xlab = "Internet Browser",
  ylab = "Users",
  names.arg = ib_asc$browser,
  border = "dark blue",
  col = "orange"
)

# Descending order, again reordering heights and names together.
ib_desc <- ib[order(-ib$users), ]
barplot(
  height = ib_desc$users,
  main = "2018 Internet Browser Users (in million)",
  xlab = "Internet Browser",
  ylab = "Users",
  names.arg = ib_desc$browser,
  border = "dark blue",
  col = "orange"
)

# qplot() with a single factor draws a bar chart of counts.
qplot(x = iris$Species)  # syntax 1: bare vector

qplot(x = Species, data = iris)  # syntax 2: column name plus data =

# gear is stored as a number, so qplot() bins it like a continuous variable.
qplot(x = gear, data = mtcars)

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# factor() tells qplot() that the number is really a category.
qplot(x = factor(gear), data = mtcars)

# ggplot() equivalent for a factor column.
ggplot(data = iris, mapping = aes(x = Species)) + geom_bar()

# Add titles and other annotations as required.
# qplot() usage for an already aggregated frequency table is not covered here.


# geom_bar() counts the rows for each x value by default and does not expect
# a y value. The aggregation is chosen through its stat argument, whose
# default is stat = "count".
ggplot(data = mtcars, mapping = aes(x = gear)) + geom_bar()  # nominal variable stored as a number

ggplot(data = iris, mapping = aes(x = Species)) + geom_bar()  # nominal variable stored as a string

# With stat = "identity", geom_bar() skips the aggregation and uses the y
# values you provide. That is the natural behaviour of geom_col().

plant <- c("setosa", "versicolor", "virginica")
plantcount <- c(50, 50, 50)
irisfreqtab <- data.frame(plant, plantcount)  # already aggregated data
ggplot(data = irisfreqtab, mapping = aes(x = plant, y = plantcount)) +
  geom_bar(stat = "identity")

# geom_col() never aggregates: it uses stat_identity() and leaves the data
# as is, so the y values must already be computed.

ggplot(data = irisfreqtab, mapping = aes(x = plant, y = plantcount)) +
  geom_col()  # numeric vector as input

ggplot(data = women, mapping = aes(x = weight, y = height)) +
  geom_col()  # numeric vector as input

#-------------------------------------------------------------------------------------
#Barplot (#Horizontal barchart)
#-------------------------------------------------------------------------------------
# horiz = TRUE draws the bars left to right: the bar values run along the
# x axis and the categories sit on the y axis. The axis titles are therefore
# swapped relative to the vertical charts.
barplot(table(iris$Species), horiz = TRUE)  # input: table of a factor
title(
  main = "Barplot showing count of samples",
  sub = "50 samples each of three iris plants",
  xlab = "Number of samples taken",
  ylab = "Category of iris plant"
)

# Numeric vector as bar lengths; names.arg labels each bar on the y axis.
barplot(women$height, names.arg = women$weight, horiz = TRUE)
title(
  main = "Typical heights of US women between 30-39 years",
  sub = "heights relate to 15 typical women's weight",
  xlab = "Height",
  ylab = "weight "
)

# Adding coord_flip() turns any vertical bar chart into a horizontal one.
qplot(x = iris$Species) + coord_flip()  # factor input, bare vector

qplot(x = Species, data = iris) + coord_flip()  # factor input, column name

# gear is stored as a number, so qplot() bins it like a continuous variable.
qplot(x = gear, data = mtcars) + coord_flip()

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# factor() tells qplot() that the number is really a category.
qplot(x = factor(gear), data = mtcars) + coord_flip()

ggplot(data = iris, mapping = aes(x = Species)) +
  geom_bar() +
  coord_flip()  # factor variable input

# Add titles and other annotations as required.
# qplot() usage for an already aggregated frequency table is not covered here.


# geom_bar() counts the rows for each x value by default and does not expect
# a y value. The aggregation is chosen through its stat argument, whose
# default is stat = "count".
ggplot(data = mtcars, mapping = aes(x = gear)) +
  geom_bar() +
  coord_flip()  # nominal variable stored as a number

ggplot(data = iris, mapping = aes(x = Species)) +
  geom_bar() +
  coord_flip()  # nominal variable stored as a string

# With stat = "identity", geom_bar() skips the aggregation and uses the y
# values you provide. That is the natural behaviour of geom_col().

plant <- c("setosa", "versicolor", "virginica")
plantcount <- c(50, 50, 50)
irisfreqtab <- data.frame(plant, plantcount)  # already aggregated data
ggplot(data = irisfreqtab, mapping = aes(x = plant, y = plantcount)) +
  geom_bar(stat = "identity") +
  coord_flip()

# geom_col() never aggregates: it uses stat_identity() and leaves the data
# as is, so the y values must already be computed.

ggplot(data = irisfreqtab, mapping = aes(x = plant, y = plantcount)) +
  geom_col() +
  coord_flip()  # numeric vector as input

ggplot(data = women, mapping = aes(x = weight, y = height)) +
  geom_col() +
  coord_flip()  # numeric vector as input

#-------------------------------------------------------------------------------------
#Barplot (#stacked)#Stacked bar chart
#-------------------------------------------------------------------------------------

# A two-way table gives one stacked bar per column: here one bar per cyl
# value, split by gear.
barplot(table(mtcars$gear, mtcars$cyl))

# spineplot() of the same cross tab gives a 100% stacked bar chart.
spineplot(table(mtcars$gear, mtcars$cyl))

# Swapping the table arguments gives one bar per gear value, split by cyl.
barplot(table(mtcars$cyl, mtcars$gear))
title(
  main = "Barplot showing number of cars as per their gear and cylinder capacity",
  sub = "mtcars builtin dataset is used",
  xlab = "gears",
  ylab = "Number of cars"
)

# A matrix dataset can be passed directly.
barplot(USPersonalExpenditure)

# Simpler matrix example: a 3 x 3 matrix holding 1..9, filled column by column.
m <- matrix(1:9, nrow = 3, ncol = 3)
barplot(m)

# A qplot() recipe for stacked bar charts is not covered here.
# For a data frame use ggplot() as shown below.

# geom_bar() counts the rows for each x value by default and does not expect
# a y value. The aggregation is chosen through its stat argument, whose
# default is stat = "count". With stat = "identity", geom_bar() skips the
# aggregation and uses the y values you provide, which is the natural
# behaviour of geom_col().

# The fill aesthetic splits each bar; stacking is the default position.
ggplot(data = warpbreaks, mapping = aes(x = wool, y = breaks, fill = tension)) +
  geom_bar(stat = "identity")

# Same chart with the stacking spelled out.
ggplot(data = warpbreaks, mapping = aes(x = wool, y = breaks, fill = tension)) +
  geom_bar(stat = "identity", position = "stack")

# 100% stacked bar chart: position = "fill" rescales every bar to full height.
ggplot(data = warpbreaks, mapping = aes(x = wool, y = breaks, fill = tension)) +
  geom_bar(stat = "identity", position = "fill")

# geom_col() does not aggregate the data, so no stat argument is needed.

ggplot(data = warpbreaks, mapping = aes(x = wool, y = breaks, fill = tension)) +
  geom_col()

ggplot(data = warpbreaks, mapping = aes(x = wool, y = breaks, fill = tension)) +
  geom_col() +
  coord_flip()

#-------------------------------------------------------------------------------------
#Barplot (#clustered)#Grouped or Clustered  bar chart
#-------------------------------------------------------------------------------------

# beside = TRUE draws the cells of each table column side by side instead of
# stacking them. The table columns become the groups on the x axis.

# Columns are cyl values: one cluster per cylinder count, one bar per gear.
barplot(table(mtcars$gear, mtcars$cyl), beside = TRUE)

# Columns are gear values: one cluster per gear, one bar per cylinder count.
# This is the chart the title below describes (x axis = gears). Note that
# beside = TRUE belongs to barplot(), not to table().
barplot(table(mtcars$cyl, mtcars$gear), beside = TRUE)
title(
  main = "Barplot showing number of cars as per their gear and cylinder capacity",
  sub = "mtcars builtin dataset is used",
  xlab = "gears",
  ylab = "Number of cars"
)

# A matrix dataset can be passed directly.
barplot(USPersonalExpenditure, beside = TRUE)

# Simpler matrix example: a 3 x 3 matrix holding 1..9, filled column by column.
m <- matrix(1:9, 3, 3)
barplot(m, beside = TRUE)

# A qplot() recipe for clustered bar charts is not covered here.
# For a data frame use ggplot() as shown below.

# geom_bar() counts the rows for each x value by default and does not expect
# a y value. The aggregation is chosen through its stat argument, whose
# default is stat = "count". With stat = "identity", geom_bar() skips the
# aggregation and uses the y values you provide, which is the natural
# behaviour of geom_col().

# Default position is "stack": the rows of each wool/tension cell pile up,
# so every coloured segment shows the total breaks of that cell.
ggplot(warpbreaks, aes(x = wool, y = breaks, fill = tension)) +
  geom_bar(stat = "identity")

# position = "dodge" does NOT add rows up. warpbreaks has several rows per
# wool/tension cell; dodging the raw rows draws them on top of each other and
# only the tallest one stays visible. Aggregate to one row per cell first so
# the clustered bars show the same totals as the stacked chart.
warp_totals <- aggregate(breaks ~ wool + tension, data = warpbreaks, FUN = sum)

ggplot(warp_totals, aes(x = wool, y = breaks, fill = tension)) +
  geom_bar(stat = "identity", position = "dodge")

# 100% stacked bar chart, shown for comparison with the clustered version.
ggplot(warpbreaks, aes(x = wool, y = breaks, fill = tension)) +
  geom_bar(stat = "identity", position = "fill")

# geom_col() does not aggregate either, so it gets the aggregated totals too.

ggplot(warp_totals, aes(x = wool, y = breaks, fill = tension)) +
  geom_col(position = "dodge")

ggplot(warp_totals, aes(x = wool, y = breaks, fill = tension)) +
  geom_col(position = "dodge") +
  coord_flip()

#-------------------------------------------------------------------------------------
#pie chart 
#-------------------------------------------------------------------------------------
# Arguments demonstrated below: pie(x, labels, radius, main, col, clockwise)

# Plain numeric vector, passed inline and then through a variable.
pie(c(10, 20, 30, 40))
x1 <- c(10, 20, 30, 40)
pie(x1)

# A named vector is the most convenient input: the names become the labels.
x1 <- setNames(x1, c("First", "Second", "Third", "Fourth"))
print(x1)

##  First Second  Third Fourth 
##     10     20     30     40

pie(x1)
# A pie chart has no axes, so xlab and ylab carry no meaning here.
title(
  main = "Pie chart",
  sub = "four slices",
  xlab = "xlab nonsensical",
  ylab = "ylab nonsensical"
)

# Labels given explicitly through the labels argument.
lbl <- c("First", "Second", "Third", "Fourth")
pie(x1, labels = lbl)

values <- c(906, 264, 289, 339, 938)
countries <- c("India", "Sri Lanka", "Nepal", "Bhutan", "China")

pie(values, labels = countries)

# radius: negative and positive values are accepted, but stay around 1.
pie(x1, radius = 1.6)

# main: chart title.
pie(x1, main = "The pie chart")

# col: a single colour paints every slice the same, so avoid it.
pie(x1, col = "red")

# One colour per slice.
clrs <- c("Red", "Green", "Blue", "Orange")
pie(x1, col = clrs)

# The rainbow() palette generates as many colours as there are slices.
pie(x1, col = rainbow(length(x1)))

# clockwise: reverse the direction in which the slices are arranged.
pie(x1, col = clrs, clockwise = TRUE)

# init.angle: change where the first slice starts.
pie(x1, col = clrs, init.angle = 45)

# Add a legend that maps each colour to its label.
pie(x1, col = clrs)
legend("bottomright", legend = lbl, fill = clrs)

# Pie chart with percentages in the labels.
slices <- c(10, 12, 4, 16, 8)
lbls <- c("US", "UK", "Australia", "Germany", "France")
pct <- round(100 * slices / sum(slices))  # share of each slice, whole percent
lbls <- paste(lbls, pct, "%")  # paste() separates the parts with a space by default
pie(slices, labels = lbls)

# Pie chart with a border colour and a line type (lty).

values <- c(906, 264, 289, 339, 938)
countries <- c("India", "Sri Lanka", "Nepal", "Bhutan", "China")

pie(values, labels = countries, border = "red", lty = 2)

# TIP: to give the slices different border colours, pass a vector of colours,
# for example border = c("red", "green", "black").

#TIP: To assign different border colors, we use a Vector of colors. For example, border = c("red", "green", "black".)


# Using data-frame columns as input: tabulate the column, then plot the counts.

x2 <- table(mtcars$gear)  # categorical variable stored as a number
pie(x2)

x3 <- table(iris$Species)  # categorical variable stored as a factor
pie(x3)

# Percentages in the labels, built up step by step.

# Step 1: raw proportions as labels.
percx2 <- prop.table(x2)
pie(x2, labels = percx2)

# Step 2: proportions scaled to percent.
percx2 <- prop.table(x2) * 100
pie(x2, labels = percx2)

# Step 3: category name, percentage and a % sign.
percx2 <- paste(names(x2), prop.table(x2) * 100, "%")
pie(x2, labels = percx2)

# Same recipe for iris: the unrounded percentages make long labels ...
percx3 <- paste(names(x3), prop.table(x3) * 100, "%")
pie(x3, labels = percx3)

# ... so round them to whole percent.
percx3 <- paste(names(x3), round(prop.table(x3) * 100), "%")
pie(x3, labels = percx3)

# ggplot2 pie chart = geom_bar() (or geom_col()) + coord_polar().

# Start from a stacked bar chart with a single column (x = "").
library(ggplot2)

ggplot(data = warpbreaks, mapping = aes(x = "", y = breaks, fill = tension)) +
  geom_bar(stat = "identity")

# Store the single-column bar (width = 1) and switch to polar coordinates.
pc1 <- ggplot(data = warpbreaks, mapping = aes(x = "", y = breaks, fill = tension)) +
  geom_bar(stat = "identity", width = 1)
pc1 + coord_polar()

# Mapping the angle to y gives the pie chart.
pc1 + coord_polar(theta = "y")

# Same steps with geom_col().
ggplot(data = warpbreaks, mapping = aes(x = "", y = breaks, fill = tension)) +
  geom_col()

pc2 <- ggplot(data = warpbreaks, mapping = aes(x = "", y = breaks, fill = tension)) +
  geom_col(width = 1)
pc2 + coord_polar()  # compare the result with and without width = 1 above

pc2 + coord_polar(theta = "y")

#-------------------------------------------------------------------------------------
#Histogram 
#-------------------------------------------------------------------------------------

# hist() rejects a categorical vector, so this call errors:
# hist(iris$Species)
hist(mtcars$gear)  # runs, but a histogram of a nominal code is not meaningful

# Histograms are for numeric vectors; use a bar chart for categorical data.
hist(AirPassengers)  # a numeric time series (ts) object is accepted as well

# A whole data frame is rejected, so this call errors:
# hist(mtcars)
hist(mtcars$mpg)  # pass a single numeric column instead

hist(
  mtcars$mpg,
  main = "Histogram of Miles Per Gallon",
  sub = "inbuilt mtcars dataset",
  xlab = "Miles Per Gallon bins",
  ylab = "Frequency or count of cars"
)

hist(mtcars$mpg, freq = TRUE)  # y axis shows counts

hist(mtcars$mpg, freq = FALSE)  # y axis shows density

plot(density(mtcars$mpg))  # the density curve on its own

# probability = TRUE and freq = FALSE both give the density scale, on which
# the density curve can be overlaid.
hist(mtcars$mpg, probability = TRUE)
hist(mtcars$mpg, freq = FALSE)
lines(density(mtcars$mpg))

hist(mtcars$mpg)  # five bins are chosen for us by default

hist(mtcars$mpg, breaks = 3)  # ask for about 3 bin breaks

# Custom, unequal bin widths with counts: R warns that the areas mislead.
hist(mtcars$mpg, breaks = c(10, 15, 20, 25, 35), freq = TRUE)

## Warning in plot.histogram(r, freq = freq1, col = col, border = border, angle =
## angle, : the AREAS in the plot are wrong -- rather use 'freq = FALSE'

# The same bins on the density scale: no warning.
hist(mtcars$mpg, breaks = c(10, 15, 20, 25, 35), freq = FALSE)

# Cosmetics: fill and border colours.
hist(mtcars$mpg, col = "orange", border = "blue")

# hist() also returns the binning it computed.
h <- hist(mtcars$mpg)
print(h)

## $breaks
## [1] 10 15 20 25 30 35
## 
## $counts
## [1]  6 12  8  2  4
## 
## $density
## [1] 0.0375 0.0750 0.0500 0.0125 0.0250
## 
## $mids
## [1] 12.5 17.5 22.5 27.5 32.5
## 
## $xname
## [1] "mtcars$mpg"
## 
## $equidist
## [1] TRUE
## 
## attr(,"class")
## [1] "histogram"

# Write each bin count just above its bar, using the returned mids and counts.
with(h, text(mids, counts, labels = counts, adj = c(0.5, -0.5)))

# qplot(): a single numeric vector produces a histogram.
qplot(x = mtcars$mpg)

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

qplot(x = mpg, data = mtcars, geom = "histogram")

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Choosing binwidth explicitly silences the message.
qplot(x = mtcars$mpg, binwidth = 5)

qplot(
  x = mtcars$mpg,
  binwidth = 5,
  main = "Histogram of Miles Per Gallon",
  xlab = "Miles Per Gallon bins",
  ylab = "Number of cars"
)

# Density curve instead of bars.
qplot(x = mpg, data = mtcars, geom = "density")

# ggplot(): histogram with the default binning.
ggplot(data = mtcars, mapping = aes(x = mpg)) + geom_histogram()

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(data = mtcars, mapping = aes(x = mpg)) + geom_histogram(binwidth = 5)

# Density curve; add a title and axis labels as needed.
ggplot(data = mtcars, mapping = aes(x = mpg)) + geom_density()

# Outline and fill colours for the bars.
ggplot(data = mtcars, mapping = aes(x = mpg)) +
  geom_histogram(binwidth = 5, color = "black", fill = "white")

#-------------------------------------------------------------------------------------
#Boxplot 
#-------------------------------------------------------------------------------------

# boxplot(x, data, notch, varwidth, names, main)
#   notch    - logical; TRUE draws a notch in each box.
#   varwidth - logical; TRUE makes the box width proportional to the sample size.
#   names    - group labels printed under each boxplot.


# Single numeric vector: one box.
boxplot(mtcars$mpg)

# The same box drawn with a notch.
boxplot(mtcars$mpg, notch = TRUE)

# qplot() boxplot from x and y arguments: x is a single group label and
# y holds 100 random normal draws.
x <- "1"
y <- rnorm(100)
qplot(x = x, y = y, geom = "boxplot")

# One box per PlantGrowth group.
qplot(x = group, y = weight, data = PlantGrowth, geom = "boxplot")

# Formula interface: response ~ grouping variable.
boxplot(formula = mpg ~ cyl, data = mtcars)

# Two grouping variables on the right-hand side put their interaction on the x axis.
boxplot(formula = mpg ~ cyl + am, data = mtcars)

# mtcars example with axis labels and a title.
boxplot(
  formula = mpg ~ cyl,
  data = mtcars,
  xlab = "Number of Cylinders",
  ylab = "Miles Per Gallon",
  main = "Mileage Data"
)

# cyl is numeric, so without a grouping ggplot2 warns about a continuous x.
qplot(x = cyl, y = mpg, data = mtcars, geom = "boxplot")

## Warning: Continuous x aesthetic -- did you forget aes(group=...)?

# group = cyl asks for one box per cylinder count.
qplot(x = cyl, y = mpg, data = mtcars, group = cyl, geom = "boxplot")

# The ggplot() equivalents: first without the grouping (same warning) ...
ggplot(data = mtcars, mapping = aes(x = cyl, y = mpg)) + geom_boxplot()

## Warning: Continuous x aesthetic -- did you forget aes(group=...)?

# ... then with group = cyl.
ggplot(data = mtcars, mapping = aes(x = cyl, y = mpg, group = cyl)) +
  geom_boxplot()

# Data Visualization: base graphics vs ggplot2
#
# Dr N Srikanth Reddy
#
# 16/06/2021