####DATA VISUALISATION USING ggplot2 PACKAGE IN R####
#OVERVIEW
#ggplot2 implements a "grammar of graphics", which enables us to build graphs/plots
#from three basic components:-
#1. Data
#2. geom: visual marks which represent data points. Includes aesthetic (aes) properties such as size, color, etc
#3. coordinate system
#*Note*: mtcars dataset has been used in this code for illustrations.
#*STRUCTURE OF THE CODE*
#1. generic codes
#2. Visualisation of one variable (X)
#2.1 Continuous variable
#2.2 Discrete variable/ Categorical variable
#3. Visualisation of two variables (X and Y)
#3a. Both continuous variables (Continuous X, Continuous Y)
#3b. One discrete and one continuous variable (Discrete X, Continuous Y)
#3c. Both discrete variables (Discrete X, Discrete Y)
#4. Visualisation of three variables
#5. Position adjustments
#6. Facet plots
#7. Labels
#8. Legends
#9. Zooming
#load library
library(tidyverse)
## -- Attaching packages ----------------------------------------- tidyverse 1.2.1 --
## v ggplot2 3.1.1 v purrr 0.3.2
## v tibble 2.1.1 v dplyr 0.8.0.1
## v tidyr 0.8.3 v stringr 1.4.0
## v readr 1.3.1 v forcats 0.4.0
## -- Conflicts -------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
#### 1. Generic codes for data visualisation using ggplot2:-####
#ggplot(data, aes(x = .., y = ..)) +
# fun(aes(color = ...)) + #adding layers with layer-specific mappings (can be multiple)
# fun() #additional elements in the plot, for example theme_bw()
#To see the last plot
#last_plot()
#To save the plot
#ggsave("name_plot.png", width = .., height = ..)
#### 2. Visualisation of one variable (X)####
# 2.1 One continuous variable
# A single continuous variable can be drawn with any of these geoms:
#   histogram           (geom_histogram)
#   frequency polygon   (geom_freqpoly)
#   dot plot            (geom_dotplot)
#   smoothed histogram  (geom_density)
# Good habit: store the data + aesthetic mapping once, then add one geom
# layer per plot.
# Template: ggplot(data, aes(x = .., y = ..))
f <- ggplot(data = mtcars, mapping = aes(x = mpg))
# Histogram: the x axis is cut into class intervals, controlled by either
#   binwidth - the width of each interval, or
#   bins     - the number of intervals
# Histogram with the interval width fixed through "binwidth"
f + geom_histogram(binwidth = 3)

# Histogram with the number of intervals fixed through "bins"
f + geom_histogram(bins = 8)

# Optional arguments of geom_histogram() used below:
#   color    - colour of the bar outlines
#   linetype - 0 blank, 1 solid, 2 dashed, 3 dotted, 4 dotdash, 5 longdash, 6 twodash
#   size     - outline width (in mm)
#   fill     - colour of the bar interior
#   alpha    - transparency, from 0 (transparent) to 1 (opaque)
#   na.rm    - FALSE warns when missing values are dropped; TRUE drops them silently
# NOTE(review): newer ggplot2 releases (3.4.0+) prefer `linewidth` over `size`
# for line thickness - confirm against the installed version before changing.
f + geom_histogram(
  bins = 8,
  color = "blue",
  linetype = 2,
  size = 2,
  fill = "green",
  alpha = 0.5,
  na.rm = FALSE
)

# Frequency polygon
# "binwidth" and "bins" behave exactly as they do for the histogram
f + geom_freqpoly(bins = 8)

# Optional arguments: color (of the line), linetype, alpha, size, na.rm
f + geom_freqpoly(
  bins = 8,
  color = "purple",
  linetype = 2,
  alpha = 1,
  size = 1.5,
  na.rm = FALSE
)

# Dot plot
# Arguments: color (dot outline), fill (dot interior), alpha (transparency)
f + geom_dotplot(
  color = "black",
  fill = "cyan",
  alpha = 1
)
## `stat_bindot()` using `bins = 30`. Pick better value with `binwidth`.

# Density plot: a smoothed version of the histogram
# "kernel" selects the kernel density function, e.g. "gaussian" (the default),
# "cosine", "biweight", etc.
# Other arguments: color, linetype, size, fill, alpha
f + geom_density(
  kernel = "gaussian",
  color = "black",
  linetype = 3,
  size = 2,
  fill = "violet",
  alpha = 0.5
)

# NOTE: several geoms can be drawn on the same plot by adding them as layers.
# Example: store each layer in its own object first so it is easy to reuse.
histo <- geom_histogram(
  bins = 8,
  color = "blue",
  linetype = 1,
  size = 2,
  fill = "violet",
  alpha = 1,
  na.rm = FALSE
)
poly <- geom_freqpoly(
  bins = 8,
  color = "black",
  linetype = 5,
  alpha = 1,
  size = 2,
  na.rm = FALSE
)
# One plot, two layers: the frequency polygon is drawn on top of the histogram
f + histo + poly

#2.2 Visualisation of a discrete / categorical variable
# A categorical variable can be visualised with
#   bar charts
#   pie charts - generally discouraged in the literature, because the eye judges
#   relative areas less accurately than linear measures (dot chart / bar chart)
# First pick a categorical variable: am (transmission) is stored as numeric in
# mtcars, so add a factor copy of it called am1. Inside mutate() columns are
# referred to by bare name; there is no need to reach back into mtcars$am.
mtc <- mutate(mtcars, am1 = as.factor(am))
# Base plot, template: ggplot(data, aes(x = .., y = ..))
# In mtcars, am is coded 0 = automatic and 1 = manual.
g <- ggplot(mtc, aes(x = am1))
# Bar chart (counts the rows in each category)
# Arguments: color, linetype, size, fill, alpha
g + geom_bar(color = "red", linetype = 1, size = 0.5, fill = "cyan", alpha = 1)

####3. Visualisation of two variables (X and Y)####
#3a. Both continuous variables (Continuous X, Continuous Y)
# mpg and disp are two continuous variables in the mtcars data.
# Geoms suited to two continuous variables:
#   scatter plot                         (geom_point)
#   scatter plot for overplotted data    (geom_jitter)
#   regression line                      (geom_smooth)
#   quantile regression lines            (geom_quantile)
# Base plot, template: ggplot(data, aes(x = .., y = ..))
h <- ggplot(data = mtcars, mapping = aes(x = mpg, y = disp))
# Scatter plot
# geom_point() can also be used with one or two categorical variables, but
# variations such as geom_jitter(), geom_bin2d() and geom_count() are more
# appropriate there.
# Arguments:
#   color, alpha, fill
#   stroke - width of the border, for shapes that have one (e.g. 24)
#   size   - point size; matters most when points overlap (overplotting)
#   shape  - 0 square; 1 circle; 2 triangle point up; 3 plus; 4 cross;
#            5 diamond; 6 triangle point down; 7 square cross; 8 star;
#            9 diamond plus; 10 circle plus; 11 triangles up and down;
#            12 square plus; 13 circle cross; 14 square and triangle down;
#            15 filled square; 16 filled circle; 17 filled triangle point-up;
#            18 filled diamond; 19 solid circle; 20 bullet (smaller circle);
#            21-25 take a border colour (color) plus an interior colour (fill):
#            21 circle; 22 square; 23 diamond; 24 triangle point-up;
#            25 triangle point-down
#   na.rm  - FALSE: missing values are removed with a warning
h + geom_point(
  color = "blue",
  alpha = 1,
  stroke = 1.5,
  fill = "red",
  size = 2,
  shape = 24,
  na.rm = FALSE
)

# geom_jitter() is the usual remedy when a scatter plot shows overplotting:
# it adds a small random offset to every point so they no longer sit exactly
# on top of each other.
# By default the offset is 40% of the resolution of the data in each direction,
# i.e. the jittered points occupy 80% of the implied bins.
# "width" (horizontal) and "height" (vertical) set the amount of jitter
# explicitly; the offset is applied in both the positive and negative direction.
h + geom_jitter(
  color = "blue",
  alpha = 1,
  stroke = 1.5,
  fill = "red",
  size = 2,
  shape = 24,
  na.rm = FALSE,
  height = 0.8,
  width = 0.8
)

# Regression line
# By default geom_smooth() draws a smoothed conditional mean.
# To fit a straight line with a linear model instead, pass method = "lm".
# Arguments: color, linetype, size, fill, alpha, weight
#   se    - TRUE by default; draws the confidence band around the line
#   level - confidence level used for the band; 0.95 by default
#   na.rm - FALSE: missing values are removed with a warning
h + geom_smooth(
  method = "lm",
  color = "blue",
  linetype = 2,
  size = 2,
  fill = "red",
  alpha = 0.8,
  se = TRUE,
  level = 0.95,
  na.rm = FALSE
)

# Quantile regression
# Models the effect of a predictor on specific quantiles of the response
# (dependent) variable rather than on its mean.
#   quantiles - conditional quantiles of y to draw; default c(0.25, 0.5, 0.75)
#   color, linetype, size
#   lineend   - line end style: "round", "butt" or "square"
#   na.rm     - FALSE: missing values are removed with a warning
h + geom_quantile(
  quantiles = c(0.25, 0.5, 0.75),
  color = "red",
  linetype = 3,
  size = 2,
  lineend = "round",
  na.rm = FALSE
)
## Loading required package: SparseM
##
## Attaching package: 'SparseM'
## The following object is masked from 'package:base':
##
## backsolve
## Smoothing formula not specified. Using: y ~ x

#3b. One discrete and one continuous variable (Discrete X, Continuous Y)
# The number of cylinders, "cyl", is a discrete variable but mtcars stores it
# as numeric, so add a factor copy of it called cyl1.
# Inside mutate() the column is referred to by its bare name.
mt <- mutate(mtcars, cyl1 = as.factor(cyl))
# summary() confirms the new column: cyl1 is reported as level counts
summary(mt)
## mpg cyl disp hp
## Min. :10.40 Min. :4.000 Min. : 71.1 Min. : 52.0
## 1st Qu.:15.43 1st Qu.:4.000 1st Qu.:120.8 1st Qu.: 96.5
## Median :19.20 Median :6.000 Median :196.3 Median :123.0
## Mean :20.09 Mean :6.188 Mean :230.7 Mean :146.7
## 3rd Qu.:22.80 3rd Qu.:8.000 3rd Qu.:326.0 3rd Qu.:180.0
## Max. :33.90 Max. :8.000 Max. :472.0 Max. :335.0
## drat wt qsec vs
## Min. :2.760 Min. :1.513 Min. :14.50 Min. :0.0000
## 1st Qu.:3.080 1st Qu.:2.581 1st Qu.:16.89 1st Qu.:0.0000
## Median :3.695 Median :3.325 Median :17.71 Median :0.0000
## Mean :3.597 Mean :3.217 Mean :17.85 Mean :0.4375
## 3rd Qu.:3.920 3rd Qu.:3.610 3rd Qu.:18.90 3rd Qu.:1.0000
## Max. :4.930 Max. :5.424 Max. :22.90 Max. :1.0000
## am gear carb cyl1
## Min. :0.0000 Min. :3.000 Min. :1.000 4:11
## 1st Qu.:0.0000 1st Qu.:3.000 1st Qu.:2.000 6: 7
## Median :0.0000 Median :4.000 Median :2.000 8:14
## Mean :0.4062 Mean :3.688 Mean :2.812
## 3rd Qu.:1.0000 3rd Qu.:4.000 3rd Qu.:4.000
## Max. :1.0000 Max. :5.000 Max. :8.000
# The weight of the car, "wt", is a continuous variable.
# Geoms suited to a discrete X and a continuous Y:
#   bar chart    (geom_bar)
#   box plot     (geom_boxplot)
#   dot plot     (geom_dotplot)
#   violin plot  (geom_violin)
# Base plot
a <- ggplot(data = mt, mapping = aes(x = cyl1, y = wt))
# Bar chart
# stat = "identity" tells geom_bar() to use the y values as given instead of
# counting rows. Bars that share an x category are stacked, so the height of
# each bar is the total of wt within that cyl1 group.
a + geom_bar(
  stat = "identity",
  color = "red",
  linetype = 4,
  size = 0.1,
  fill = "blue",
  alpha = 0.5
)

# Box plot
# Shows five summary statistics: Q1, Q2 (median), Q3 and the lower and upper
# whisker ends; points beyond the whiskers are drawn as outliers.
#   outlier.color, outlier.shape, outlier.fill, outlier.alpha, outlier.stroke
#   varwidth - TRUE makes the box width proportional to sqrt(n) of each group
#   na.rm
#   coef     - whisker length as a multiple of the IQR (default 1.5)
a + geom_boxplot(
  outlier.color = "red",
  varwidth = TRUE,
  na.rm = FALSE,
  coef = 1.5
)

# Dot plot per group
# binaxis = "y" bins along the continuous (y) variable within each x category
# Other arguments: color, fill, alpha
a + geom_dotplot(
  binaxis = "y",
  color = "red",
  fill = "yellow"
)
## `stat_bindot()` using `bins = 30`. Pick better value with `binwidth`.

# Violin plot
# A compact display of a continuous distribution: a blend of a box plot and a
# density plot.
# Arguments: color, linetype, fill, alpha
a + geom_violin(
  color = "red",
  fill = "cyan"
)

#3c. Both discrete variables (Discrete X, Discrete Y)
# Jitter plot: with two discrete axes every observation falls on one of a few
# grid positions, so a small random offset is added to make the overlapping
# points visible. cyl and am are numeric in mtcars, hence factor().
ggplot(mtcars, aes(x = factor(cyl), y = factor(am))) +
  geom_jitter(width = 0.2, height = 0.2)

####4. Visualisation of three variables####
# Map a third (categorical) variable to colour. factor() makes ggplot2 treat
# the numeric cyl column as discrete, so each cylinder count gets its own colour.
h + geom_point(aes(color = factor(cyl)))

####5. Position adjustments####
# Position adjustments decide how geoms that would occupy the same space on a
# plot are arranged relative to each other.
# Base plot: add a factor copy of cyl (cyl1) to use as the fill variable.
mtc1 <- mutate(mtc, cyl1 = as.factor(cyl))
# x uses the factor am1 rather than the numeric am, so the bars sit on a
# discrete axis with one tick per transmission type instead of a continuous
# 0-1 scale.
k <- ggplot(mtc1, aes(x = am1, fill = cyl1))
# Multiple (side-by-side) bars
k + geom_bar(position = "dodge")

# Component bars scaled to proportions (every bar has total height 1)
k + geom_bar(position = "fill")

# Component bars showing counts
k + geom_bar(position = "stack")

####6. Facet plots####
# facet_grid() lays panels out as a matrix defined by row and/or column variables.
# Reuse the scatter plot coded in section 3.
# Base plot, template: ggplot(data, aes(x = .., y = ..))
h <- ggplot(mtcars, aes(x = mpg, y = disp))
# Scatter plot
scatter <- h + geom_point(
  color = "blue", alpha = 1, stroke = 1.5, fill = "red",
  size = 2, shape = 24, na.rm = FALSE
)
# Faceted plots
# Add the layer facet_grid(row_variable ~ column_variable); a "." on either
# side means "no faceting in this direction".
# Columns only
scatter + facet_grid(. ~ cyl)

# Rows only
scatter + facet_grid(cyl ~ .)

# Rows and columns: am defines the rows, cyl the columns.
# The formula has a single "~"; a nested form such as am ~ . ~ cyl is not the
# documented interface.
scatter + facet_grid(am ~ cyl)

####7. Labels####
# labs() sets the text elements of a plot:
#   title    - main title, on the top
#   subtitle - subtitle of the plot
#   x, y     - axis labels
#   caption  - placed at the bottom right (sources, notes, copyright)
#   tag      - placed at the top left (usually a reference letter for the figure)
# "m" holds the base plot plus its point layer
m <- h + geom_point(aes(color = factor(cyl)))
# "n" is the same plot with labels added
n <- m + labs(
  title = "This is title",
  subtitle = "Sub",
  caption = "caption",
  x = "This is X axis",
  y = "This is Y axis"
)
n

####8. Legends####
# theme(legend.position = ...) accepts "right", "left", "bottom" or "top"
n + theme(legend.position = "bottom")

####9. Zooming####
# Zoom into a specific area of the plot through the xlim and ylim arguments
# of coord_cartesian()
n + coord_cartesian(xlim = c(10, 20), ylim = c(0, 300))
