####DATA VISUALISATION USING ggplot2 PACKAGE IN R####

#OVERVIEW

#ggplot2 is a "grammar of graphics" which enables us to make graphs/plots
#using three basic components:-
#1. Data 
#2. geom: visual marks which represent data points. Includes aesthetic (aes) properties such as size, color, etc
#3. coordinate system

#*Note*: mtcars dataset has been used in this code for illustrations.

#*STRUCTURE OF THE CODE*

#1. generic codes

#2. Visualisation of one  variable (X)
#2.1 Continuous variable 
#2.2 Discrete variable/ Categorical variable

#3. Visualisation of two variables (X and Y)
#3a. Both continuous variables (Continuous X, Continuous Y)
#3b. One discrete and one continuous variable (Discrete X, Continuous Y)
#3c. Both discrete variables (Discrete X, Discrete Y)


#4. Visualisation of three variables

#5. Position adjustments

#6. Facet plots

#7. Labels

#8. Legends

#9. Zooming

#load library
library(tidyverse)
## -- Attaching packages ----------------------------------------- tidyverse 1.2.1 --
## v ggplot2 3.1.1       v purrr   0.3.2  
## v tibble  2.1.1       v dplyr   0.8.0.1
## v tidyr   0.8.3       v stringr 1.4.0  
## v readr   1.3.1       v forcats 0.4.0
## -- Conflicts -------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
#### 1. Generic codes for data visualisation using ggplot2:-####

#ggplot(data, aes(x = .., y = ..)) + 
# fun(aes(color = ...)) + #adding layers with layer specific mappings (can be multiple)
#  fun() #additional elements in the plot. example theme_bw()

#To see the last plot
#last_plot()

#To save the plot
#ggsave("name_plot.png", width = .., height = ..)



#### 2. Visualisation of one variable (X)####

# 2.1 One continuous variable

# visualisation of one continuous variable can be done using the following functions:-
#histogram (geom_histogram), 
#frequency polygon (geom_freqpoly)
#dotplot (geom_dotplot)
#smoothened histogram (geom_density)


# Build the first part of the generic code once and keep it, so that each
# geom below can simply be added to it.
# Template: ggplot(data, aes(x = ..))
f <- ggplot(data = mtcars, mapping = aes(x = mpg))


# --- Histogram ---
# The x axis is split into class intervals through one of two arguments:
#   binwidth : width of each interval
#   bins     : number of intervals
# Histogram specified by interval width
f + geom_histogram(binwidth = 3)

# Histogram specified by number of intervals
f + geom_histogram(bins = 8)

# Optional arguments for a histogram:
#   color    : colour of the bar outline
#   linetype : 0 = blank, 1 = solid, 2 = dashed, 3 = dotted, 4 = dotdash,
#              5 = longdash, 6 = twodash
#   size     : outline width (in mm)
#   fill     : colour of the bar interior
#   alpha    : transparency, from 0 (transparent) to 1 (opaque)
#   na.rm    : FALSE warns when missing values are dropped while drawing;
#              TRUE drops them silently
f + geom_histogram(
  bins = 8,
  color = "blue",
  linetype = 2,
  size = 2,
  fill = "green",
  alpha = 0.5,
  na.rm = FALSE
)

# --- Frequency polygon ---
# binwidth / bins behave exactly as they do for the histogram
f + geom_freqpoly(bins = 8)

# Optional arguments: color (of the line), linetype, alpha, size, na.rm
f + geom_freqpoly(
  bins = 8,
  color = "purple",
  linetype = 2,
  alpha = 1,
  size = 1.5,
  na.rm = FALSE
)

# --- Dot plot ---
# Optional arguments: color, fill, alpha
f + geom_dotplot(
  color = "black",
  fill = "cyan",
  alpha = 1
)
## `stat_bindot()` using `bins = 30`. Pick better value with `binwidth`.

# --- Density curve (smoothed histogram) ---
# The smoothing kernel is selectable: "gaussian" (default), "cosine",
# "biweight", etc.
# Optional arguments: color, linetype, size, fill, alpha
f + geom_density(
  kernel = "gaussian",
  color = "black",
  linetype = 3,
  size = 2,
  fill = "violet",
  alpha = 0.5
)

# --- Several geoms on one plot ---
# Layers can be stored in variables first and then added together.
histo <- geom_histogram(
  bins = 8,
  color = "blue",
  linetype = 1,
  size = 2,
  fill = "violet",
  alpha = 1,
  na.rm = FALSE
)
poly <- geom_freqpoly(
  bins = 8,
  color = "black",
  linetype = 5,
  alpha = 1,
  size = 2,
  na.rm = FALSE
)
# One graph, two layers: the histogram with the frequency polygon on top
f + histo + poly

#2.2 Visualisation of discrete variable/ categorical variable
#visualisation of a categorical variable can be done by plotting 
#bar charts

#pie charts.. As per literature, pie charts are considered a poor way of displaying information because 
#the eye is worse at judging relative areas than linear measures (dot chart/ bar chart).. 

# Pick a categorical variable: `am` (transmission type) is stored as a number
# in mtcars, so add a factor copy called `am1`.
# Inside mutate() the column is referenced by its bare name; `mtcars$am`
# would bypass dplyr's data masking and always read the global data frame.
mtc <- mutate(mtcars, am1 = as.factor(am))

# Creating and saving first part of the generic code
# ggplot(data, aes(x = ..))
g <- ggplot(mtc, aes(x = am1)) # am: 0 = automatic, 1 = manual (see ?mtcars)

# To create a barchart (bar height = number of rows in each category)
# arguments.. color (outline), linetype, size, fill, alpha
g + geom_bar(color = "red", linetype = 1, size = 0.5, fill = "cyan", alpha = 1)

####3. Visualisation of two variables (X and Y)####

#3a. Both continuous variables (Continuous X, Continuous Y)

#In mtcars data mpg and disp are two continuous variables
#Visualisation options with two continous variables include
# Scatter plot (geom_point)
# Scatter plot for overplotted graphs (geom_jitter)
# Plotting regression line (geom_smooth)
# Quantile regression lines (geom_quantile)


# Base plot shared by every geom in this section.
# Template: ggplot(data, aes(x = .., y = ..))
h <- ggplot(data = mtcars, mapping = aes(x = mpg, y = disp))



# --- Scatter plot ---
# geom_point() also accepts one or two categorical variables, but
# geom_jitter(), geom_bin2d() and geom_count() usually suit those cases better.

# Arguments:
#   color, alpha, fill
#   stroke : border width, for shapes that have a border (e.g. 24)
#   size   : point size; matters most when points overlap (overplotting)
#   shape  :
#      0 square            1 circle              2 triangle point-up
#      3 plus              4 cross               5 diamond
#      6 triangle point-down                     7 square cross
#      8 star              9 diamond plus       10 circle plus
#     11 triangles up and down                  12 square plus
#     13 circle cross     14 square and triangle down
#     15 filled square    16 filled circle      17 filled triangle point-up
#     18 filled diamond   19 solid circle       20 bullet (smaller circle)
#     21 circle, 22 square, 23 diamond, 24 triangle point-up,
#     25 triangle point-down -- these five take a border colour (`color`)
#        and a separate interior colour (`fill`)
#   na.rm  : FALSE removes missing values with a warning

h + geom_point(
  color = "blue",
  alpha = 1,
  stroke = 1.5,
  fill = "red",
  size = 2,
  shape = 24,
  na.rm = FALSE
)

# --- Jittered scatter plot ---
# Used when a scatter plot suffers from overplotting: every point is moved by
# a small random amount so that points no longer sit on top of each other.
# When omitted, the jitter defaults to 40% of the resolution of the data in
# each direction, so the jittered values occupy 80% of the implied bins.
# `height` (vertical) and `width` (horizontal) set the amount explicitly; the
# jitter is applied in both the positive and the negative direction.
h + geom_jitter(
  color = "blue",
  alpha = 1,
  stroke = 1.5,
  fill = "red",
  size = 2,
  shape = 24,
  na.rm = FALSE,
  height = 0.8,
  width = 0.8
)

# --- Regression line ---
# Without a `method`, geom_smooth() chooses a smoother for the conditional
# mean by itself. Pass `method = lm` to fit a linear model instead.
# Arguments:
#   color, linetype, size, fill, alpha, weight
#   se    : TRUE (default) draws the confidence band around the line
#   level : confidence level used for that band, 0.95 by default
#   na.rm : FALSE removes missing values with a warning

h + geom_smooth(
  method = lm,
  color = "blue",
  linetype = 2,
  size = 2,
  fill = "red",
  alpha = 0.8,
  se = TRUE,
  level = 0.95,
  na.rm = FALSE
)

# --- Quantile regression ---
# Models the effect of the predictor on chosen quantiles of the response
# (dependent) variable rather than on its mean.
# Arguments:
#   quantiles : conditional quantiles of y; default c(0.25, 0.5, 0.75)
#   color, linetype, size
#   lineend   : line end style, "round" / "butt" / "square"
#   na.rm     : FALSE removes missing values with a warning
h + geom_quantile(
  quantiles = c(0.25, 0.5, 0.75),
  color = "red",
  linetype = 3,
  size = 2,
  lineend = "round",
  na.rm = FALSE
)
## Loading required package: SparseM
## 
## Attaching package: 'SparseM'
## The following object is masked from 'package:base':
## 
##     backsolve
## Smoothing formula not specified. Using: y ~ x

#3b. One discrete and one continuous variable (Discrete X, Continuous Y)

# The number of cylinders ("cyl") is a discrete variable in the "mtcars"
# dataset but is stored as numeric, so add a factor copy called `cyl1`.
# The column is referenced by its bare name inside mutate() instead of
# `mtcars$cyl`, so the expression is evaluated on the data being mutated.
mt <- mutate(mtcars, cyl1 = as.factor(cyl))
# Inspect the result: the new `cyl1` column is summarised as level counts
summary(mt)
##       mpg             cyl             disp             hp       
##  Min.   :10.40   Min.   :4.000   Min.   : 71.1   Min.   : 52.0  
##  1st Qu.:15.43   1st Qu.:4.000   1st Qu.:120.8   1st Qu.: 96.5  
##  Median :19.20   Median :6.000   Median :196.3   Median :123.0  
##  Mean   :20.09   Mean   :6.188   Mean   :230.7   Mean   :146.7  
##  3rd Qu.:22.80   3rd Qu.:8.000   3rd Qu.:326.0   3rd Qu.:180.0  
##  Max.   :33.90   Max.   :8.000   Max.   :472.0   Max.   :335.0  
##       drat             wt             qsec             vs        
##  Min.   :2.760   Min.   :1.513   Min.   :14.50   Min.   :0.0000  
##  1st Qu.:3.080   1st Qu.:2.581   1st Qu.:16.89   1st Qu.:0.0000  
##  Median :3.695   Median :3.325   Median :17.71   Median :0.0000  
##  Mean   :3.597   Mean   :3.217   Mean   :17.85   Mean   :0.4375  
##  3rd Qu.:3.920   3rd Qu.:3.610   3rd Qu.:18.90   3rd Qu.:1.0000  
##  Max.   :4.930   Max.   :5.424   Max.   :22.90   Max.   :1.0000  
##        am              gear            carb       cyl1  
##  Min.   :0.0000   Min.   :3.000   Min.   :1.000   4:11  
##  1st Qu.:0.0000   1st Qu.:3.000   1st Qu.:2.000   6: 7  
##  Median :0.0000   Median :4.000   Median :2.000   8:14  
##  Mean   :0.4062   Mean   :3.688   Mean   :2.812         
##  3rd Qu.:1.0000   3rd Qu.:4.000   3rd Qu.:4.000         
##  Max.   :1.0000   Max.   :5.000   Max.   :8.000
# weight of the car "wt" is a continuous variable

#Visualisation options
#barchart (geom_bar)
#boxplot  (geom_boxplot)
#dotplot  (geom_dotplot)
#Violin plot (geom_violin)


# Base plot: cylinder count (factor) on x, car weight on y
a <- ggplot(data = mt, mapping = aes(x = cyl1, y = wt))

# --- Bar chart ---
# stat = "identity" draws the y values as they are instead of counting rows.
# Bars belonging to the same x category are stacked, so the total bar height
# is the sum of the y variable within that category.
a + geom_bar(
  stat = "identity",
  color = "red",
  linetype = 4,
  size = 0.1,
  fill = "blue",
  alpha = 0.5
)

# --- Box plot ---
# Shows five summary statistics: Q1, Q2 (median), Q3 and the upper and lower
# extreme values.
# Arguments:
#   outlier.color, shape, fill, alpha, stroke
#   varwidth : TRUE makes box width proportional to sqrt(n) of each group
#   na.rm
#   coef     : whisker length as a multiple of the IQR (default 1.5)
a + geom_boxplot(
  outlier.color = "red",
  varwidth = TRUE,
  na.rm = FALSE,
  coef = 1.5
)

# --- Dot plot ---
# binaxis names the axis along which the dots are binned.
# Other arguments: color, fill, alpha
a + geom_dotplot(
  binaxis = "y",
  color = "red",
  fill = "yellow"
)
## `stat_bindot()` using `bins = 30`. Pick better value with `binwidth`.

# --- Violin plot ---
# A compact display of a continuous distribution: a blend of a box plot and
# a density plot.
# Arguments: color, linetype, fill, alpha
a + geom_violin(
  color = "red",
  fill = "cyan"
)

#3c. Both discrete variables (Discrete X, Discrete Y)
#Jitter plot





####4. Visualisation of three variables####

# A third (categorical) variable can be shown through colour: wrapping `cyl`
# in factor() gives every cylinder count its own discrete colour.
h + geom_point(
  mapping = aes(color = factor(cyl))
)

####5. Position adjustments####
#These are done to represent geoms which occupy same space on a plot
# First part of the code
# `mtc` already carries the factor `am1`; add the factor `cyl1` next to it.
# The column is referenced by its bare name inside mutate() (not `mtc$cyl`).
mtc1 <- mutate(mtc, cyl1 = as.factor(cyl))
# Use the factor `am1` on x rather than the numeric `am`: transmission type is
# categorical, and a numeric x would produce a continuous axis with
# meaningless in-between values.
k <- ggplot(mtc1, aes(x = am1, fill = cyl1))
# To make multiple bar (one bar per cyl1 level, side by side)
k + geom_bar(position = "dodge")

# to make component bar with percentages (every bar scaled to the same height)
k + geom_bar(position = "fill")

# to make component bar with numbers (counts stacked on top of each other)
k + geom_bar(position = "stack")

####6. Facet plots####
#facet_grid forms the matrix of panels according to row/ column variables
#lets take a previously coded scatter plot from section 3 of this code
# creating first part of the code
# ggplot(data, aes(x = .., y = ..))
h <- ggplot(mtcars, aes(x = mpg, y = disp))
# Scatter plot (kept in a variable so that each facet layout can reuse it)
scatter <- h + geom_point(color = "blue", alpha = 1, stroke = 1.5, fill = "red", size = 2, shape = 24, na.rm = FALSE)
# To make faceted plots
# add layer facet_grid(row variable ~ column variable);
# a "." on either side means "no faceting in that direction"
# only column wise
scatter + facet_grid(. ~ cyl)

# only row wise
scatter + facet_grid(cyl ~ .)

# both row and column wise
# (written as the plain two-sided formula `rows ~ cols`; the earlier nested
# form `am~.~cyl` only worked because the stray "." was discarded)
scatter + facet_grid(am ~ cyl)

####7. Labels####
#To give main title.. on the top
#subtitle.. for the plot
#x axis and y axis labels
#caption.. placed at bottom right (used for sources, notes, copyright)
#tag.. at top left (usually to tag a figure with a reference letter, etc)

# "m" combines the base plot with its point layer, coloured by cylinder count
m <- h + geom_point(mapping = aes(color = factor(cyl)))
# "n" is the same plot with a title, a subtitle, a caption and axis labels
n <- m + labs(
  title = "This is title",
  subtitle = "Sub",
  caption = "caption",
  x = "This is X axis",
  y = "This is Y axis"
)
n

####8. Legends####
# The legend is placed through the theme: legend.position accepts
# "right", "left", "bottom" or "top".
n + theme(
  legend.position = "bottom"
)

####9. Zooming####
# Zoom in on a region of the plot by giving the x and y ranges through the
# xlim and ylim arguments of coord_cartesian().
n + coord_cartesian(xlim = c(10, 20), ylim = c(0, 300))