# Mindanao State University
# General Santos City
# Submitted by: Angga, Princess Joy & Davy D. Dongosa
# Visualizations in R
# Math 108
# Task: Perform Advanced Visualizations in R
# Task 1: Correlation and Sub-sections
# The following plots help to examine how well correlated two variables are
# Scatterplot (most frequently used plot for data analysis to understand the nature of relationship between two variables)
# It can be drawn using geom_point()
# Additionally, geom_smooth which draws a smoothing line (based on loess) by default, can be tweaked to draw the line of best fit by setting method='lm'
# Steps to make a scatterplot
# install.packages("ggplot2")
# load package and data
# Setup: print large numbers in full instead of scientific notation (e.g. 1e+48).
options(scipen = 999)
library(ggplot2)
theme_set(theme_bw()) # make the black-and-white theme the default for later plots
data("midwest", package = "ggplot2")
# midwest <- read.csv("http://goo.gl/G1K41K") # bkup data source

# Scatterplot of area (x) against total population (y) from `midwest`.
# Points are coloured by state and sized by population density; a loess
# smoothing line without its confidence band is drawn on top.
gg <- ggplot(midwest, aes(x = area, y = poptotal)) +
  geom_point(aes(col = state, size = popdensity)) +
  # Spell out FALSE: the shorthand `F` is an ordinary variable that can be
  # reassigned, whereas FALSE is a reserved word.
  geom_smooth(method = "loess", se = FALSE) +
  # Restrict both axes; rows outside these limits are dropped with a warning.
  xlim(c(0, 0.1)) +
  ylim(c(0, 500000)) +
  labs(
    subtitle = "Area Vs Population",
    y = "Population",
    x = "Area",
    title = "Scatterplot",
    caption = "Source: midwest"
  )
plot(gg)
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 15 rows containing non-finite values (`stat_smooth()`).
## Warning: Removed 15 rows containing missing values (`geom_point()`).

# Scatterplot With Encircling (used to encircle a special group of points or a region of the chart, so as to draw attention to those peculiar cases when presenting the results)
# This can be conveniently done using the geom_encircle() in ggalt package
# Within geom_encircle(), set the data to a new dataframe that contains only the points (rows) of interest
# Moreover, you can expand the curve so that it passes just outside the points
# The color and size (thickness) of the curve can be modified as well
# Steps to make a scatterplot with encircling
# install 'ggalt' pkg
# devtools::install_github("hrbrmstr/ggalt")
# Setup: print large numbers in full and attach the plotting packages.
options(scipen = 999)
library(ggplot2)
# install.packages("ggalt")
library(ggalt)
## Registered S3 methods overwritten by 'ggalt':
## method from
## grid.draw.absoluteGrob ggplot2
## grobHeight.absoluteGrob ggplot2
## grobWidth.absoluteGrob ggplot2
## grobX.absoluteGrob ggplot2
## grobY.absoluteGrob ggplot2

# Rows to highlight: population in (350000, 500000] and area in (0.01, 0.1).
# Only these rows are passed to geom_encircle() below.
midwest_select <- midwest[
  midwest$poptotal > 350000 &
    midwest$poptotal <= 500000 &
    midwest$area > 0.01 &
    midwest$area < 0.1,
]

# Plot: the same scatterplot of area vs total population, with a red curve
# drawn around the selected rows.
ggplot(midwest, aes(x = area, y = poptotal)) +
  geom_point(aes(col = state, size = popdensity)) + # draw points
  # Spell out FALSE: the shorthand `F` is an ordinary, reassignable variable.
  geom_smooth(method = "loess", se = FALSE) + # draw smoothing line
  xlim(c(0, 0.1)) + # restrict the x axis
  ylim(c(0, 500000)) + # restrict the y axis
  geom_encircle(
    aes(x = area, y = poptotal),
    data = midwest_select,
    color = "red",
    size = 2,
    expand = 0.08
  ) + # encircle
  labs(
    subtitle = "Area Vs Population",
    y = "Population",
    x = "Area",
    title = "Scatterplot + Encircle",
    caption = "Source: midwest"
  )
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 15 rows containing non-finite values (`stat_smooth()`).
## Removed 15 rows containing missing values (`geom_point()`).

# Jitter Plot
# use the mpg dataset to plot city mileage (cty) vs highway mileage (hwy)
# Steps to make the plot
# load package and data
# Setup: attach ggplot2 and load the mpg dataset that ships with it.
library(ggplot2)
data(mpg, package = "ggplot2") # alternate source: "http://goo.gl/uEeRGu")
theme_set(theme_bw()) # make the black-and-white theme the default for later plots

# Base plot: city mileage (cty) on x, highway mileage (hwy) on y.
g <- ggplot(mpg, aes(cty, hwy))

# Scatterplot with a linear best-fit line and no confidence band.
g + geom_point() +
  # Spell out FALSE: the shorthand `F` is an ordinary, reassignable variable.
  geom_smooth(method = "lm", se = FALSE) +
  labs(
    subtitle = "mpg: city vs highway mileage",
    y = "hwy",
    x = "cty",
    title = "Scatterplot with overlapping points",
    # The data plotted here is mpg, so the caption credits mpg (it previously
    # said "midwest", carried over from the earlier sections).
    caption = "Source: mpg"
  )
## `geom_smooth()` using formula = 'y ~ x'

# What we have here is a scatterplot of city and highway mileage in mpg dataset
# Scatterplot with overlapping points
# But, this innocent looking plot is hiding something
# The original data has 234 data points but the chart seems to display fewer points
# What has happened? This is because there are many overlapping points appearing as a single dot
# The fact that both cty and hwy are integers in the source dataset made it all the more convenient to hide this detail
# So just be extra careful the next time you make scatterplot with integers
# So how to handle this? There are few options
# We can make a jitter plot with geom_jitter()
# As the name suggests, the overlapping points are randomly jittered around their original positions based on a threshold controlled by the width argument
# Steps to make jitter plot
# load package and data
# Jitter plot: attach ggplot2, set the default theme and load the mpg data.
library(ggplot2)
theme_set(theme_bw()) # pre-set the bw theme.
data(mpg, package = "ggplot2")
# mpg <- read.csv("http://goo.gl/uEeRGu")

# Base plot mapping city mileage to x and highway mileage to y.
g <- ggplot(mpg, aes(x = cty, y = hwy))

# Draw each observation as a small point, randomly displaced horizontally by
# up to `width` so that coincident points no longer sit on top of each other.
g +
  geom_jitter(size = 1, width = 0.5) +
  labs(
    title = "Jittered Points",
    subtitle = "mpg: city vs highway mileage",
    x = "cty",
    y = "hwy"
  )

# The second option to overcome the problem of data points overlap is to use what is called a counts chart
# Wherever more points overlap, the size of the circle gets bigger
# Steps to make a counts chart
# load package and data
# Setup: attach ggplot2 and load the mpg dataset that ships with it.
library(ggplot2)
data(mpg, package = "ggplot2")
# mpg <- read.csv("http://goo.gl/uEeRGu")

# Counts chart
theme_set(theme_bw()) # pre-set the bw theme.
g <- ggplot(mpg, aes(cty, hwy))

# geom_count() draws one circle per distinct (cty, hwy) pair, sized by how
# many observations share that position; the size legend is hidden.
g + geom_count(col = "tomato3", show.legend = FALSE) +
  labs(
    subtitle = "mpg: city vs highway mileage",
    y = "hwy",
    x = "cty",
    # This is a counts chart, so the title says so (it previously read
    # "Jittered Points", copied from the jitter section).
    title = "Counts Plot"
  )
