# ---- Two quantitative variables: Abdomen vs BFP_Siri ----
library(ggplot2)
Body <- read.csv("C:/Users/Jason/Desktop/Grad Cert Data Science/3. Data Visualisation/data/Body.csv")

# Optional overview: a scatterplot matrix is a quick way to scan for the
# strongest pairwise relationships before choosing a pair to plot.
# library(GGally)
# ggpairs(Body, columns = c(3, 6, 7, 10:19), axisLabels = "internal")

# Base plot object (Abdomen on x, BFP_Siri on y); each figure adds layers to it.
p3 <- ggplot(Body, aes(Abdomen, BFP_Siri))

# Plain scatter plot
p3_points <- p3 + geom_point()
p3_points

# Scatter plot with a linear fit plus the default smoother drawn in red.
# geom_smooth() prints the formula it uses ('y ~ x'); for the default
# smoother it also reports the method it picked (loess for this data).
p3_trends <- p3_points +
  geom_smooth(method = "lm") +
  geom_smooth(colour = "red")
p3_trends

# Same figure with semi-transparent rug marks along the axes
p3_trends + geom_rug(alpha = 0.5)

# For dense data, 2D density contours can represent the bivariate density
p3_points + geom_density2d()
# ---- Time series: population over time ----
economics <- read.csv("C:/Users/Jason/Desktop/Grad Cert Data Science/3. Data Visualisation/data/economics.csv")

# Convert the date column from day/month/year text to Date objects.
# NOTE(review): the head() output recorded further down this script lists the
# first observations as 1967-01-07, 1967-01-08, ... (consecutive days in
# January), which suggests some rows are stored month-first and are being
# read with day and month swapped -- confirm against the raw CSV.
economics[["date"]] <- as.Date(economics[["date"]], format = "%d/%m/%Y")

# Line chart of pop against date
p4 <- ggplot(economics, aes(date, pop))
p4 +
  labs(title = "US Population Growth 1967 - 2015") +
  geom_line()
# ---- Reshape to long format so each series can be drawn in its own facet ----
library(tidyr) # Load tidyr to access gather()

# Stack the five measurement columns (pce through unemploy) into key/value
# pairs: `Variable` holds the original column name, `Value` the observation.
# gather() is superseded by pivot_longer(); it is kept here so the row order
# and data.frame class of economics_l stay exactly as they were.
economics_l <- gather(economics, Variable, Value, pce:unemploy)

# Attach display labels to each series. The levels are spelled out so every
# label is tied to its column by name. Without `levels`, factor() sorts the
# names alphabetically (pce, pop, psavert, uempmed, unemploy) and hands out
# labels by position, which previously put "Unemployed '000" on uempmed and
# "Unemployed Duration" on unemploy, i.e. swapped.
# Assumes the CSV follows the ggplot2 `economics` dataset, where uempmed is
# the median duration of unemployment and unemploy is the number of
# unemployed in thousands -- confirm if the file comes from another source.
economics_l$Variable <- factor(
  economics_l$Variable,
  levels = c("pce", "pop", "psavert", "uempmed", "unemploy"),
  labels = c(
    "PCE",
    "Population '000",
    "PSR",
    "Unemployed Duration",
    "Unemployed '000"
  )
)
head(economics_l)
## date Variable Value
## 1 1967-01-07 PCE 507.4
## 2 1967-01-08 PCE 510.5
## 3 1967-01-09 PCE 516.3
## 4 1967-01-10 PCE 512.9
## 5 1967-01-11 PCE 518.1
## 6 1967-01-12 PCE 525.8

# One panel per series, stacked vertically. scales = "free" lets each panel
# use its own axis range, since the series are on very different scales.
p4 <- ggplot(data = economics_l, aes(x = date, y = Value))
p4 + geom_line() +
  facet_grid(Variable ~ ., scales = "free", labeller = label_value) +
  labs(title = "US Economic Data 1967 - 2015", y = "")
# ---- Reduce resolution: prepare to smooth monthly data into quarters ----
library(dplyr)
# Attaching dplyr masks filter() and lag() from 'package:stats', and
# intersect(), setdiff(), setequal() and union() from 'package:base'.

# Tag every row of the wide economics data with its quarter (via quarters())
# and its four-digit year as text; these become the grouping keys later.
economics <- economics %>%
  mutate(
    quarter = quarters(date),
    year = format(date, "%Y")
  )
# ---- Quarterly means, reshaped to long format ----
# Group by year and quarter so each summary row covers one quarter of one year.
economics_ag <- group_by(economics, year, quarter)

# Average every series within each quarter. `date` keeps the latest
# observation date found in the quarter (not necessarily the quarter's last
# calendar day). `.groups = "drop"` returns an ungrouped result, so the
# leftover `year` grouping that summarise() used to report no longer carries
# into the reshaping step below.
economics_ag <- summarise(
  economics_ag,
  pce = mean(pce),
  pop = mean(pop),
  psavert = mean(psavert),
  uempmed = mean(uempmed),
  unemploy = mean(unemploy),
  date = max(date),
  .groups = "drop"
)

# Restructure to long format: `Variable` holds the original column name and
# `Value` the quarterly mean. gather() is superseded by pivot_longer(); it is
# kept here so the row order of economics_ag_l is unchanged.
economics_ag_l <- gather(economics_ag, Variable, Value, pce:unemploy)

# Attach display labels. The levels are spelled out so every label is tied to
# its column by name. Without `levels`, factor() sorts the names
# alphabetically (pce, pop, psavert, uempmed, unemploy) and hands out labels
# by position, which previously put "Unemployed '000" on uempmed and
# "Unemployed Duration" on unemploy, i.e. swapped.
# Assumes the CSV follows the ggplot2 `economics` dataset, where uempmed is
# the median duration of unemployment and unemploy is the number of
# unemployed in thousands -- confirm if the file comes from another source.
economics_ag_l$Variable <- factor(
  economics_ag_l$Variable,
  levels = c("pce", "pop", "psavert", "uempmed", "unemploy"),
  labels = c(
    "PCE",
    "Population '000",
    "PSR",
    "Unemployed Duration",
    "Unemployed '000"
  )
)

# NOTE(review): output previously recorded at this point showed 1967, 1968
# and 1969 each collapsing to a single Q1 row dated 12 January, while 1970
# had separate Q1-Q3 rows. That pattern suggests the "%d/%m/%Y" date parse
# applied earlier swaps day and month for some rows -- confirm against the
# raw CSV before relying on the quarterly figures.
head(economics_ag_l)
# Quarterly series: one vertically stacked panel per variable, with each
# panel free to use its own axis range.
p5 <- ggplot(economics_ag_l, aes(date, Value))
p5 +
  labs(title = "US Economic Quarterly Data 1967 - 2015 ", y = "") +
  geom_line() +
  facet_grid(Variable ~ ., labeller = label_value, scales = "free")
# ---- One qualitative variable on one quantitative variable ----
mpg <- read.csv("C:/Users/Jason/Desktop/Grad Cert Data Science/3. Data Visualisation/data/mpg.csv")

# Box plot of cty for each class, in the default category order
p6 <- ggplot(mpg, aes(class, cty))
p6 + geom_boxplot()

# Re-order the class categories by their median cty. order() is applied to
# the negated medians, so the levels run from the highest median to the
# lowest.
mpg_rank <- summarise(group_by(mpg, class), med = median(cty))
mpg$class <- factor(mpg$class, levels = mpg_rank$class[order(-mpg_rank$med)])

# Rebuild the base plot so it picks up the re-levelled class factor
p6 <- ggplot(mpg, aes(class, cty))
p6 + geom_boxplot()

# Horizontal version of the same box plot
p6 + geom_boxplot() + coord_flip()
## Side-by-side variations
# Each figure overlays the mean of cty per class as a red point.
# stat_summary() takes the summary function through `fun`; the older `fun.y`
# argument is deprecated and produced a warning on every call.

# Violins
p6 + geom_violin() +
  stat_summary(fun = "mean", geom = "point", colour = "red")

# Jittered point plots
p6 + geom_jitter(width = .2, alpha = .25) +
  stat_summary(fun = "mean", geom = "point", colour = "red")

# Stacked dot plots
# geom_dotplot() falls back to a bin width of 1/30 of the data range when
# `binwidth` is not supplied, and prints a message suggesting a better value.
p6 + geom_dotplot(binaxis = "y", stackdir = "center", dotsize = 1/2, alpha = .25) +
  stat_summary(fun = "mean", geom = "point", colour = "red")