#Data Visualization basic graphics vs ggplot2
library(ggplot2)
#-------------------------------------------------------------------------------------
# Scatter plot (base graphics)
#-------------------------------------------------------------------------------------
# The BOD data frame has 6 rows and 2 columns giving the biochemical oxygen
# demand versus time in an evaluation of water quality.
print(BOD)
## Time demand
## 1 1 8.3
## 2 2 10.3
## 3 3 19.0
## 4 4 16.0
## 5 5 15.6
## 6 7 19.8
str(BOD)
## 'data.frame': 6 obs. of 2 variables:
## $ Time : num 1 2 3 4 5 7
## $ demand: num 8.3 10.3 19 16 15.6 19.8
## - attr(*, "reference")= chr "A1.4, p. 270"

# A two-column data frame is plotted as column 1 (Time, x) against
# column 2 (demand, y).
plot(BOD)

# Explicit vectors: the first argument is x, so here demand is on the x axis
# and Time on the y axis, i.e. the axes are swapped relative to plot(BOD).
plot(BOD$demand, BOD$Time)

# Draw without the default annotations (ann = FALSE), then add the titles and
# axis labels separately so they do not overprint the defaults.
plot(BOD, ann = FALSE)
title(
  main = "Biochemical Oxygen Demand",
  sub = "Water Quality Test",
  xlab = "Time in Days",
  ylab = "Oxygen Demand"
)

# The same annotations supplied directly in the plot() call.
plot(
  BOD,
  main = "Biochemical Oxygen Demand",
  sub = "Water Quality Test",
  xlab = "Time in Days",
  ylab = "Oxygen Demand"
)

# NOTE: base plot() draws as a side effect and returns NULL invisibly, so `pz`
# holds NULL rather than the plot. Unlike ggplot objects, a base plot cannot be
# stored by assignment; recordPlot() can snapshot the current device instead.
pz <- plot(BOD)

# Scatter plot with ggplot2's quick-plot helpers.
# NOTE(review): qplot()/quickplot() are deprecated in recent ggplot2 releases;
# confirm against the installed version.
qplot(x = BOD$Time, y = BOD$demand, geom = "point")

quickplot(x = BOD$Time, y = BOD$demand, geom = "point")

# Same plot, with the column names resolved inside the data frame.
qplot(x = Time, y = demand, data = BOD, geom = "point")

qplot(
  x = Time,
  y = demand,
  data = BOD,
  geom = "point",
  main = "Biochemical Oxygen Demand",
  xlab = "Time in Days",
  ylab = "Oxygen Demand"
)

# Building the plot layer by layer: data only (a data frame) ...
ggplot(data = BOD)

# ... plus the coordinate aesthetics ...
ggplot(data = BOD) + aes(x = Time, y = demand)

# ... plus a point geometry to draw the markers.
ggplot(data = BOD) + aes(x = Time, y = demand) + geom_point()

# A partial plot can be saved in a variable and extended with `+` later.
p1 <- ggplot(data = BOD) + aes(x = Time, y = demand)
p1 + geom_point()

# Swapping the geometry is a one-word change, which keeps the code short.
p1 + geom_area()

# Convention 1 (recommended): aesthetics declared in ggplot().
ggplot(BOD, mapping = aes(x = Time, y = demand)) + geom_point()

# Convention 2: aesthetics declared in the geom.
ggplot(BOD) + geom_point(mapping = aes(x = Time, y = demand))

p2 <- ggplot(BOD, mapping = aes(x = Time, y = demand)) + geom_point()
print(p2)

# Titles and axis labels added with ggtitle()/xlab()/ylab() ...
p2 +
  ggtitle(label = "Biochemical Oxygen Demand", subtitle = "Water Quality Test") +
  xlab("Time in Days") +
  ylab("Oxygen Demand")

# ... or all at once with labs(), which also takes a caption and a tag.
p2 +
  labs(
    title = "Biochemical Oxygen Demand",
    subtitle = "Water Quality Test",
    x = "Time in Days",
    y = "Oxygen Demand",
    caption = "Data Source:caption",
    tag = "tag: subplot 1"
  )

#-------------------------------------------------------------------------------------
# Line plot, i.e. a scatter plot whose points are joined by a line (base graphics)
#-------------------------------------------------------------------------------------
# ann = FALSE suppresses the default axis labels ("Time"/"demand"); without it
# the xlab/ylab passed to title() would be printed on top of the defaults.
plot(BOD, type = "l", ann = FALSE)
title(
  main = "Biochemical Oxygen Demand",
  sub = "Water Quality Test",
  xlab = "Time in Days",
  ylab = "Oxygen Demand"
)

# The same annotations supplied directly in the plot() call.
plot(
  BOD,
  type = "l",
  main = "Biochemical Oxygen Demand",
  sub = "Water Quality Test",
  xlab = "Time in Days",
  ylab = "Oxygen Demand"
)

# Line plot with ggplot2's quick-plot helper: vectors first, then columns of a
# data frame, then with a title and axis labels.
qplot(x = BOD$Time, y = BOD$demand, geom = "line")

qplot(x = Time, y = demand, data = BOD, geom = "line")

qplot(
  x = Time,
  y = demand,
  data = BOD,
  geom = "line",
  main = "Biochemical Oxygen Demand",
  xlab = "Time in Days",
  ylab = "Oxygen Demand"
)

# The same line plot built with ggplot().
ggplot(BOD, mapping = aes(x = Time, y = demand)) + geom_line()

# Store the plot (this reassigns p1), then decorate it in two equivalent ways.
p1 <- ggplot(BOD, mapping = aes(x = Time, y = demand)) + geom_line()
p1 +
  ggtitle(label = "Biochemical Oxygen Demand", subtitle = "Water Quality Test") +
  xlab("Time in Days") +
  ylab("Oxygen Demand")

p1 +
  labs(
    title = "Biochemical Oxygen Demand",
    subtitle = "Water Quality Test",
    x = "Time in Days",
    y = "Oxygen Demand",
    caption = "Data Source:caption",
    tag = "tag: subplot 1"
  )

#-------------------------------------------------------------------------------------
# Bar plot (column / vertical bar chart, base graphics)
#-------------------------------------------------------------------------------------
# Input: a factor. plot() on a factor draws one bar per level.
plot(x = iris$Species)
title(
  main = "Barplot showing count of samples",
  sub = "50 samples each of three iris plants",
  xlab = "Category of iris plant",
  ylab = "Number of samples taken"
)

# Input: a frequency table built from the same factor.
barplot(height = table(iris$Species))
title(
  main = "Barplot showing count of samples",
  sub = "50 samples each of three iris plants",
  xlab = "Category of iris plant",
  ylab = "Number of samples taken"
)

# Input: a numeric vector of bar heights, labelled with a second vector.
barplot(height = women$height, names.arg = women$weight)
title(
  main = "Typical heights of US women between 30-39 years",
  sub = "heights relate to 15 typical women's weight",
  xlab = "weight ",
  ylab = "Height"
)

# Let's use our own dataset: internet browser users (in million).
browser <- c("Chrome", "Edge", "Firefox", "IE", "Opera", "Safari", "Others")
users <- c(2502.4, 150.78, 395.83, 238.05, 86.49, 387.65, 134.8)
ib <- data.frame(browser, users)
ib
## browser users
## 1 Chrome 2502.40
## 2 Edge 150.78
## 3 Firefox 395.83
## 4 IE 238.05
## 5 Opera 86.49
## 6 Safari 387.65
## 7 Others 134.80

# Plain bar plot of the user counts (no labels yet).
barplot(ib$users)

# Improved plot: title, axis labels, bar names and colours.
barplot(
  height = ib$users,
  main = "2018 Internet Browser Users (in million)",
  xlab = "Internet Browser",
  ylab = "Users",
  names.arg = ib$browser,
  border = "dark blue",
  col = "orange"
)

# Ascending order. The whole data frame is reordered so that the bar labels
# stay attached to their own values; sorting only `height` while passing the
# unsorted `ib$browser` as names.arg would label the bars with the wrong browser.
ib_asc <- ib[order(ib$users), ]
barplot(
  height = ib_asc$users,
  main = "2018 Internet Browser Users (in million)",
  xlab = "Internet Browser",
  ylab = "Users",
  names.arg = ib_asc$browser,
  border = "dark blue",
  col = "orange"
)

# Descending order, again reordering values and labels together.
ib_desc <- ib[order(ib$users, decreasing = TRUE), ]
barplot(
  height = ib_desc$users,
  main = "2018 Internet Browser Users (in million)",
  xlab = "Internet Browser",
  ylab = "Users",
  names.arg = ib_desc$browser,
  border = "dark blue",
  col = "orange"
)

# Bar charts with qplot(): a factor given as a vector (syntax 1) ...
qplot(x = iris$Species)

# ... or as a column of a data frame (syntax 2).
qplot(x = Species, data = iris)

# A categorical variable stored as a number is treated as continuous and
# binned like a histogram, not counted per category.
qplot(x = gear, data = mtcars)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Wrapping it in factor() tells qplot that the number is categorical.
qplot(x = factor(gear), data = mtcars)

# ggplot() equivalent for a factor variable.
ggplot(iris, mapping = aes(x = Species)) + geom_bar()

# Add titles and other decorations as required.
# (The author does not know a qplot() form for already-aggregated frequency tables.)
# geom_bar() counts the rows for each x value by default, so it does not expect
# a y value. How the aggregation is done is set by its `stat` argument, whose
# default is stat = "count".
ggplot(mtcars, mapping = aes(x = gear)) + geom_bar()  # nominal variable stored as a number

ggplot(iris, mapping = aes(x = Species)) + geom_bar()  # nominal variable stored as labels

# With stat = "identity", geom_bar() skips the aggregation and uses the y values
# you provide. This mirrors the natural behaviour of geom_col().
plant <- c("setosa", "versicolor", "virginica")
plantcount <- c(50, 50, 50)
irisfreqtab <- data.frame(plant = plant, plantcount = plantcount)  # already aggregated
ggplot(irisfreqtab, mapping = aes(x = plant, y = plantcount)) +
  geom_bar(stat = "identity")

# geom_col() does not aggregate: it uses stat_identity(), which leaves the data
# as is. It therefore expects the y values to be computed already.
ggplot(irisfreqtab, mapping = aes(x = plant, y = plantcount)) +
  geom_col()

# Numeric vectors on both axes.
ggplot(women, mapping = aes(x = weight, y = height)) +
  geom_col()

#-------------------------------------------------------------------------------------
# Bar plot (horizontal bar chart, base graphics)
#-------------------------------------------------------------------------------------
# With horiz = TRUE the categories run along the y axis and the bar lengths
# along the x axis, so the axis labels are swapped compared with the vertical
# version of the same chart.
barplot(table(iris$Species), horiz = TRUE)  # input: frequency table of a factor
title(
  main = "Barplot showing count of samples",
  sub = "50 samples each of three iris plants",
  xlab = "Number of samples taken",
  ylab = "Category of iris plant"
)

# Numeric vector of bar lengths (height), one bar per weight value.
barplot(women$height, names.arg = women$weight, horiz = TRUE)
title(
  main = "Typical heights of US women between 30-39 years",
  sub = "heights relate to 15 typical women's weight",
  xlab = "Height",
  ylab = "weight "
)

# Horizontal bar charts in ggplot2: build the vertical chart, then add
# coord_flip() to swap the axes.
qplot(x = iris$Species) + coord_flip()  # factor given as a vector

qplot(x = Species, data = iris) + coord_flip()  # factor given as a data frame column

# A categorical variable stored as a number is binned like a histogram.
qplot(x = gear, data = mtcars) + coord_flip()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# factor() tells qplot that the number is categorical.
qplot(x = factor(gear), data = mtcars) + coord_flip()

# ggplot() equivalent for a factor variable.
ggplot(iris, mapping = aes(x = Species)) + geom_bar() + coord_flip()

# Add titles and other decorations as required.
# (The author does not know a qplot() form for already-aggregated frequency tables.)
# geom_bar() counts the rows for each x value by default, so it does not expect
# a y value. How the aggregation is done is set by its `stat` argument, whose
# default is stat = "count".
ggplot(mtcars, mapping = aes(x = gear)) + geom_bar() + coord_flip()  # nominal variable stored as a number

ggplot(iris, mapping = aes(x = Species)) + geom_bar() + coord_flip()  # nominal variable stored as labels

# With stat = "identity", geom_bar() skips the aggregation and uses the y values
# you provide. This mirrors the natural behaviour of geom_col().
plant <- c("setosa", "versicolor", "virginica")
plantcount <- c(50, 50, 50)
irisfreqtab <- data.frame(plant = plant, plantcount = plantcount)  # already aggregated
ggplot(irisfreqtab, mapping = aes(x = plant, y = plantcount)) +
  geom_bar(stat = "identity") +
  coord_flip()

# geom_col() does not aggregate: it uses stat_identity(), which leaves the data
# as is. It therefore expects the y values to be computed already.
ggplot(irisfreqtab, mapping = aes(x = plant, y = plantcount)) +
  geom_col() +
  coord_flip()

# Numeric vectors on both axes.
ggplot(women, mapping = aes(x = weight, y = height)) +
  geom_col() +
  coord_flip()

#-------------------------------------------------------------------------------------
# Bar plot (stacked bar chart, base graphics)
#-------------------------------------------------------------------------------------
# Input: a cross-tabulation. Each column of the table becomes one stacked bar,
# so here there is one bar per cyl value, split by gear.
barplot(height = table(mtcars$gear, mtcars$cyl))

# The same cross-tabulation shown as a 100% stacked chart.
spineplot(table(mtcars$gear, mtcars$cyl))

# Transposed cross-tabulation: one bar per gear value, split by cyl.
barplot(height = table(mtcars$cyl, mtcars$gear))
title(
  main = "Barplot showing number of cars as per their gear and cylinder capacity",
  sub = "mtcars builtin dataset is used",
  xlab = "gears",
  ylab = "Number of cars"
)

# Input: a matrix dataset.
barplot(height = USPersonalExpenditure)

# Simpler example of matrix input: a 3 x 3 matrix gives three stacked bars.
m <- matrix(1:9, nrow = 3, ncol = 3)
barplot(height = m)

# (The author does not know a qplot() form for stacked bar charts.)
# With a data frame, use ggplot() as shown below.
# geom_bar() counts the rows for each x value by default, so it does not expect
# a y value; the aggregation is set by its `stat` argument (default "count").
# With stat = "identity", geom_bar() skips the aggregation and uses the y values
# you provide. This mirrors the natural behaviour of geom_col().
ggplot(warpbreaks, mapping = aes(x = wool, y = breaks, fill = tension)) +
  geom_bar(stat = "identity")

# The same chart with the stacking position spelled out.
ggplot(warpbreaks, mapping = aes(x = wool, y = breaks, fill = tension)) +
  geom_bar(stat = "identity", position = "stack")

# 100% stacked bar chart.
ggplot(warpbreaks, mapping = aes(x = wool, y = breaks, fill = tension)) +
  geom_bar(stat = "identity", position = "fill")

# geom_col() does not aggregate the data by default.
ggplot(warpbreaks, mapping = aes(x = wool, y = breaks, fill = tension)) +
  geom_col()

# Horizontal version of the stacked chart.
ggplot(warpbreaks, mapping = aes(x = wool, y = breaks, fill = tension)) +
  geom_col() +
  coord_flip()

#-------------------------------------------------------------------------------------
# Bar plot (grouped or clustered bar chart, base graphics)
#-------------------------------------------------------------------------------------
# Input: a cross-tabulation. barplot() makes one cluster per COLUMN of the
# table, so gear must be the column variable for the x axis to show gears, as
# the title below says. beside = TRUE is an argument of barplot(), not of table().
barplot(table(mtcars$cyl, mtcars$gear), beside = TRUE)
title(
  main = "Barplot showing number of cars as per their gear and cylinder capacity",
  sub = "mtcars builtin dataset is used",
  xlab = "gears",
  ylab = "Number of cars"
)

# Input: a matrix dataset, one cluster per column.
barplot(USPersonalExpenditure, beside = TRUE)

# Simpler example of matrix input: a 3 x 3 matrix gives three clusters of three bars.
m <- matrix(1:9, 3, 3)
barplot(m, beside = TRUE)

# (The author does not know a qplot() form for clustered bar charts.)
# With a data frame, use ggplot() as shown below.
# geom_bar() counts the rows for each x value by default, so it does not expect
# a y value; the aggregation is set by its `stat` argument (default "count").
# With stat = "identity", geom_bar() skips the aggregation and uses the y values
# you provide. This mirrors the natural behaviour of geom_col().
ggplot(warpbreaks, aes(x = wool, y = breaks, fill = tension)) +
  geom_bar(stat = "identity")  # default position: stacked, so bars add up to totals

# position = "dodge" only separates the fill groups; it does not add up the
# rows inside one wool/tension group. warpbreaks has several rows per group, so
# dodging the raw data would draw those bars on top of each other and show only
# the largest value. Aggregate to one total per group first so the clustered
# chart shows the same totals as the stacked one.
wb_totals <- aggregate(breaks ~ wool + tension, data = warpbreaks, FUN = sum)

ggplot(wb_totals, aes(x = wool, y = breaks, fill = tension)) +
  geom_bar(stat = "identity", position = "dodge")

# 100% stacked bar chart (works on the raw rows because stacking adds them up).
ggplot(warpbreaks, aes(x = wool, y = breaks, fill = tension)) +
  geom_bar(stat = "identity", position = "fill")

# geom_col() does not aggregate the data by default, so it also needs the totals.
ggplot(wb_totals, aes(x = wool, y = breaks, fill = tension)) +
  geom_col(position = "dodge")

# Horizontal version of the clustered chart.
ggplot(wb_totals, aes(x = wool, y = breaks, fill = tension)) +
  geom_col(position = "dodge") +
  coord_flip()

#-------------------------------------------------------------------------------------
# Pie chart (base graphics)
#-------------------------------------------------------------------------------------
# Arguments used below: pie(x, labels, radius, main, col, clockwise, init.angle)

# Input: a plain numeric vector, inline or stored in a variable.
pie(c(10, 20, 30, 40))
x1 <- c(10, 20, 30, 40)
pie(x1)

# A named vector is the most convenient input: the names become the slice labels.
names(x1) <- c("First", "Second", "Third", "Fourth")
print(x1)
## First Second Third Fourth
## 10 20 30 40
pie(x1)
# A pie has no axes, so xlab/ylab are shown here only to see where they land.
title(main = "Pie chart", sub = "four slices", xlab = "xlab nonsensical", ylab = "ylab nonsensical")

# Using the labels argument.
lbl <- c("First", "Second", "Third", "Fourth")
pie(x1, labels = lbl)

values <- c(906, 264, 289, 339, 938)
countries <- c("India", "Sri Lanka", "Nepal", "Bhutan", "China")
pie(values, labels = countries)

# Using radius: values around 1 are suggested; 1.6 is deliberately too large.
pie(x1, radius = 1.6)

# Using main.
pie(x1, main = "The pie chart")

# Using colour: a single colour makes the slices indistinguishable, do not use.
pie(x1, col = "red")

# One colour per slice.
clrs <- c("Red", "Green", "Blue", "Orange")
pie(x1, col = clrs)

# Using the colour palette called rainbow.
pie(x1, col = rainbow(length(x1)))

# Changing the direction in which the slices are arranged.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
pie(x1, col = clrs, clockwise = TRUE)

# Changing the starting point of the first slice.
pie(x1, col = clrs, init.angle = 45)

# Adding a legend.
pie(x1, col = clrs)
legend("bottomright", lbl, fill = clrs)

# Pie chart with percentages in the labels.
slices <- c(10, 12, 4, 16, 8)
lbls <- c("US", "UK", "Australia", "Germany", "France")
# Share of each slice in the total, as a whole-number percentage.
pct <- round(slices / sum(slices) * 100)
# paste() separates its pieces with a single space by default.
lbls <- paste(lbls, pct, "%")
pie(slices, labels = lbls)

# Pie chart with a custom border colour and line type.
values <- c(906, 264, 289, 339, 938)
countries <- c("India", "Sri Lanka", "Nepal", "Bhutan", "China")
pie(x = values, labels = countries, border = "red", lty = 2)

# TIP: to give pie() different border colours, pass a vector of colours,
# for example border = c("red", "green", "black").

# Using a data frame column as input: tabulate it first.
x2 <- table(mtcars$gear)  # categorical variable stored as a number
pie(x2)

x3 <- table(iris$Species)  # categorical variable stored as a factor
pie(x3)

# Percentages in the labels, step by step.
# Step 1: proportions (0-1) as labels.
percx2 <- prop.table(x2)
pie(x2, labels = percx2)

# Step 2: proportions scaled to percentages.
percx2 <- prop.table(x2) * 100
pie(x2, labels = percx2)

# Step 3: category name, percentage and a "%" sign pasted together
# (paste() separates its pieces with a single space by default).
percx2 <- paste(names(x2), prop.table(x2) * 100, "%")
pie(x2, labels = percx2)

# The same for iris; the unrounded percentages make long labels ...
percx3 <- paste(names(x3), prop.table(x3) * 100, "%")
pie(x3, labels = percx3)

# ... so round them to whole numbers.
percx3 <- paste(names(x3), round(prop.table(x3) * 100), "%")
pie(x3, labels = percx3)

# Pie chart in ggplot2: a single stacked bar (geom_bar()/geom_col()) drawn in
# polar coordinates (coord_polar()).
# Start by looking at the stacked bar with a single column.
library(ggplot2)
ggplot(warpbreaks, mapping = aes(x = "", y = breaks, fill = tension)) +
  geom_bar(stat = "identity")

# width = 1 makes the bar fill the whole x range before it is bent into a circle.
pc1 <- ggplot(warpbreaks, mapping = aes(x = "", y = breaks, fill = tension)) +
  geom_bar(stat = "identity", width = 1)
pc1 + coord_polar()

# Map the angle to y so that each slice's angle reflects its value.
pc1 + coord_polar(theta = "y")

# The same steps with geom_col().
ggplot(warpbreaks, mapping = aes(x = "", y = breaks, fill = tension)) +
  geom_col()

# Try this with and without the width argument to see the difference.
pc2 <- ggplot(warpbreaks, mapping = aes(x = "", y = breaks, fill = tension)) +
  geom_col(width = 1)
pc2 + coord_polar()

pc2 + coord_polar(theta = "y")

#-------------------------------------------------------------------------------------
# Histogram (base graphics)
#-------------------------------------------------------------------------------------
#hist(iris$Species)  # error: hist() cannot accept a categorical vector
hist(mtcars$gear)  # runs, but a histogram of a nominal variable is nonsensical

# A histogram is only meaningful for a numeric vector; use a bar chart for
# categorical data.
hist(AirPassengers)  # a numeric time series (class "ts") is accepted as well

#hist(mtcars)  # error: cannot input an entire data frame
hist(mtcars$mpg)  # only a numeric column of a data frame

hist(
  mtcars$mpg,
  main = "Histogram of Miles Per Gallon",
  sub = "inbuilt mtcars dataset",
  xlab = "Miles Per Gallon bins",
  ylab = "Frequency or count of cars"
)

# TRUE/FALSE are spelled out below because T and F are ordinary variables that
# can be reassigned.
hist(mtcars$mpg, freq = TRUE)  # watch the y axis: counts

hist(mtcars$mpg, freq = FALSE)  # watch the y axis: density

plot(density(mtcars$mpg))  # what the density curve looks like on its own

# probability = TRUE and freq = FALSE are two spellings of the same request.
hist(mtcars$mpg, probability = TRUE)
hist(mtcars$mpg, freq = FALSE)
lines(density(mtcars$mpg))  # overlay the density curve on the last histogram

hist(mtcars$mpg)  # five bins are given to us by default for this data

# A single number for breaks is only a suggestion for the number of cells.
hist(mtcars$mpg, breaks = 3)

# Custom, unequal bin widths: freq = TRUE triggers a warning because bar areas
# no longer represent the counts.
hist(mtcars$mpg, breaks = c(10, 15, 20, 25, 35), freq = TRUE)
## Warning in plot.histogram(r, freq = freq1, col = col, border = border, angle =
## angle, : the AREAS in the plot are wrong -- rather use 'freq = FALSE'

# The same custom bins on the density scale: no warning.
hist(mtcars$mpg, breaks = c(10, 15, 20, 25, 35), freq = FALSE)

# Cosmetics: fill and border colours.
hist(mtcars$mpg, col = "orange", border = "blue")
# Return value of hist(): an object of class "histogram" holding the breaks,
# counts, densities and bin mid-points. plot = FALSE computes it without drawing.
h <- hist(mtcars$mpg, plot = FALSE)
print(h)
## $breaks
## [1] 10 15 20 25 30 35
##
## $counts
## [1] 6 12 8 2 4
##
## $density
## [1] 0.0375 0.0750 0.0500 0.0125 0.0250
##
## $mids
## [1] 12.5 17.5 22.5 27.5 32.5
##
## $xname
## [1] "mtcars$mpg"
##
## $equidist
## [1] TRUE
##
## attr(,"class")
## [1] "histogram"

# Draw the histogram with some headroom above the tallest bar. The count labels
# are placed just above each bar, and with the default y range the label of the
# tallest bar would be clipped at the top of the plot region.
plot(h, ylim = c(0, max(h$counts) * 1.15))
text(h$mids, h$counts, labels = h$counts, adj = c(0.5, -0.5))  # add count labels

# Histograms with qplot(): a single numeric variable gives a histogram.
qplot(x = mtcars$mpg)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# The same histogram with the geom named and the column taken from a data frame.
qplot(x = mpg, data = mtcars, geom = "histogram")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Choosing the bin width avoids the message above.
qplot(x = mtcars$mpg, binwidth = 5)

qplot(
  x = mtcars$mpg,
  binwidth = 5,
  main = "Histogram of Miles Per Gallon",
  xlab = "Miles Per Gallon bins",
  ylab = "Number of cars"
)

# Density curve instead of bars.
qplot(x = mpg, data = mtcars, geom = "density")

# Histograms with ggplot().
ggplot(mtcars, mapping = aes(x = mpg)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(mtcars, mapping = aes(x = mpg)) +
  geom_histogram(binwidth = 5)

# Density curve; add titles, axis labels etc. as needed.
ggplot(mtcars, mapping = aes(x = mpg)) +
  geom_density()

# Outline and fill colours for the bars.
ggplot(mtcars, mapping = aes(x = mpg)) +
  geom_histogram(binwidth = 5, color = "black", fill = "white")

#-------------------------------------------------------------------------------------
# Boxplot (base graphics)
#-------------------------------------------------------------------------------------
# boxplot(x, data, notch, varwidth, names, main)
# notch:    logical; set to TRUE to draw a notch in each box.
# varwidth: logical; set to TRUE to draw box widths proportional to the sample size.
# names:    group labels printed under each boxplot.
boxplot(mtcars$mpg)

# TRUE is spelled out because T is an ordinary variable that can be reassigned.
boxplot(mtcars$mpg, notch = TRUE)

# qplot() boxplot from x and y vectors: a single group label on x and
# 100 random normal values on y.
x <- "1"
y <- rnorm(100)
qplot(x = x, y = y, geom = "boxplot")

# One box per group, with the columns taken from a data frame.
qplot(x = group, y = weight, data = PlantGrowth, geom = "boxplot")

# Base graphics formula syntax: response ~ grouping variable.
boxplot(formula = mpg ~ cyl, data = mtcars)

# Put the combination of two variables on the x axis.
boxplot(formula = mpg ~ cyl + am, data = mtcars)

# The mtcars boxplot with axis labels and a title.
boxplot(
  formula = mpg ~ cyl,
  data = mtcars,
  xlab = "Number of Cylinders",
  ylab = "Miles Per Gallon",
  main = "Mileage Data"
)

# cyl is numeric, so without a grouping qplot() warns ...
qplot(x = cyl, y = mpg, data = mtcars, geom = "boxplot")
## Warning: Continuous x aesthetic -- did you forget aes(group=...)?

# ... and group = cyl gives one box per cylinder count.
qplot(x = cyl, y = mpg, data = mtcars, group = cyl, geom = "boxplot")

# This is equivalent to the following ggplot() calls (first with the warning):
ggplot(mtcars, mapping = aes(x = cyl, y = mpg)) +
  geom_boxplot()
## Warning: Continuous x aesthetic -- did you forget aes(group=...)?

ggplot(mtcars, mapping = aes(x = cyl, y = mpg, group = cyl)) +
  geom_boxplot()
