#install.packages("tidyverse")
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5 v purrr 0.3.4
## v tibble 3.1.5 v dplyr 1.0.7
## v tidyr 1.1.4 v stringr 1.4.0
## v readr 2.0.2 v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
# Print the mpg tibble to inspect its columns and first rows.
print(mpg)
## # A tibble: 234 x 11
## manufacturer model displ year cyl trans drv cty hwy fl class
## <chr> <chr> <dbl> <int> <int> <chr> <chr> <int> <int> <chr> <chr>
## 1 audi a4 1.8 1999 4 auto~ f 18 29 p comp~
## 2 audi a4 1.8 1999 4 manu~ f 21 29 p comp~
## 3 audi a4 2 2008 4 manu~ f 20 31 p comp~
## 4 audi a4 2 2008 4 auto~ f 21 30 p comp~
## 5 audi a4 2.8 1999 6 auto~ f 16 26 p comp~
## 6 audi a4 2.8 1999 6 manu~ f 18 26 p comp~
## 7 audi a4 3.1 2008 6 auto~ f 18 27 p comp~
## 8 audi a4 quattro 1.8 1999 4 manu~ 4 18 26 p comp~
## 9 audi a4 quattro 1.8 1999 4 auto~ 4 16 25 p comp~
## 10 audi a4 quattro 2 2008 4 manu~ 4 20 28 p comp~
## # ... with 224 more rows
# Open mpg in the data viewer.
view(mpg)

# Only data is supplied here: no aesthetic mapping and no layer.
ggplot(mpg)

# aes() is the aesthetic mapping: it ties columns to visual properties
# (here displ -> x and hwy -> y). No geom has been added yet.
ggplot(mpg, aes(displ, hwy))

# One-variable views of hwy: histogram, frequency polygon, density.
ggplot(mpg, aes(hwy)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot(mpg, aes(hwy)) +
  geom_freqpoly()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot(mpg, aes(hwy)) +
  geom_density()

# Bar chart with class on the x axis.
ggplot(mpg, aes(class)) +
  geom_bar()

# Two-variable view: displ on x, hwy on y.
ggplot(mpg, aes(displ, hwy)) +
  geom_point()
# Box plots of hwy across displ.
# displ is continuous, so geom_boxplot() needs an explicit grouping; without
# one ggplot2 warns "Continuous x aesthetic -- did you forget aes(group=...)?"
# and draws a single box. cut_width() bins displ into 0.5-wide groups so one
# box is drawn per bin.
ggplot(data = mpg, aes(x = displ, y = hwy)) +
  geom_boxplot(aes(group = cut_width(displ, 0.5)))
# Violin plot of hwy for each class.
ggplot(mpg, aes(class, hwy)) +
  geom_violin()

# Histogram of cty.
ggplot(mpg, aes(cty)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Bar chart with manufacturer on the x axis.
ggplot(mpg, aes(manufacturer)) +
  geom_bar()
# Scatter plots ----
ggplot(mpg, aes(displ, cty)) +
  geom_point()

# Fixed (non-mapped) appearance is set as arguments to the geom.
ggplot(mpg, aes(displ, hwy)) +
  geom_point(color = "blue", size = 2, shape = 17, alpha = 0.5)

# Same styling, with geom_jitter() in place of geom_point().
ggplot(mpg, aes(displ, hwy)) +
  geom_jitter(color = "blue", size = 2, shape = 17, alpha = 0.5)

ggplot(mpg, aes(displ, hwy)) +
  geom_point(color = "blue")

# Colour mapped to a column inside aes().
ggplot(mpg, aes(displ, hwy, color = class)) +
  geom_point()

# Putting "blue" inside aes() maps colour to the constant string "blue"
# rather than setting the colour, so the dots come out red, not blue.
ggplot(mpg, aes(displ, hwy, color = "blue")) +
  geom_point()
# Open the help page for the mpg data set.
help(mpg)
## starting httpd help server ... done
# Print the tibble again for reference.
print(mpg)
## # A tibble: 234 x 11
## manufacturer model displ year cyl trans drv cty hwy fl class
## <chr> <chr> <dbl> <int> <int> <chr> <chr> <int> <int> <chr> <chr>
## 1 audi a4 1.8 1999 4 auto~ f 18 29 p comp~
## 2 audi a4 1.8 1999 4 manu~ f 21 29 p comp~
## 3 audi a4 2 2008 4 manu~ f 20 31 p comp~
## 4 audi a4 2 2008 4 auto~ f 21 30 p comp~
## 5 audi a4 2.8 1999 6 auto~ f 16 26 p comp~
## 6 audi a4 2.8 1999 6 manu~ f 18 26 p comp~
## 7 audi a4 3.1 2008 6 auto~ f 18 27 p comp~
## 8 audi a4 quattro 1.8 1999 4 manu~ 4 18 26 p comp~
## 9 audi a4 quattro 1.8 1999 4 auto~ 4 16 25 p comp~
## 10 audi a4 quattro 2 2008 4 manu~ 4 20 28 p comp~
## # ... with 224 more rows
# Colour and size both mapped to columns.
ggplot(
  data = mpg,
  mapping = aes(x = displ, y = hwy, color = cty, size = displ)
) +
  geom_point()

# drv drives both colour and shape.
ggplot(
  data = mpg,
  mapping = aes(x = displ, y = cty, color = drv, shape = drv)
) +
  geom_point()

# stroke is set together with shape 21, an outline colour and a fill colour.
ggplot(data = mpg, mapping = aes(x = displ, y = cty)) +
  geom_point(
    shape = 21,
    color = "black",
    fill = "white",
    size = 5,
    stroke = 2
  )

# Colour mapped to a logical expression instead of a column.
ggplot(data = mpg, mapping = aes(x = displ, y = cty, color = displ < 5)) +
  geom_point()
# Faceting ----
# Facet functions are a simple way to build small multiples:
#   facet_wrap() - panels based on a single variable
#   facet_grid() - a grid of panels based on two variables
#   nrow / ncol  - set the dimensions of the panel layout
ggplot(mpg, aes(displ, hwy)) +
  geom_point() +
  facet_wrap(~ class, nrow = 2)

# Help pages for both facet functions.
help(facet_wrap)
help(facet_grid)

# drv on the left of the formula, cyl on the right.
ggplot(mpg, aes(displ, hwy)) +
  geom_point() +
  facet_grid(drv ~ cyl)
1. Create a scatter plot of displ vs cty faceted by year.
# One panel per year.
ggplot(data = mpg, mapping = aes(x = displ, y = cty)) +
  geom_point() +
  facet_wrap(~ year)
2. Create a scatter plot of displ vs cty faceted by year and cyl.
# year on the left of the formula, cyl on the right.
ggplot(data = mpg, mapping = aes(x = displ, y = cty)) +
  geom_point() +
  facet_grid(year ~ cyl)
3. How does placement within facet_grid(cyl ~ year) affect the output?
# Formula sides swapped relative to year ~ cyl.
ggplot(data = mpg, mapping = aes(x = displ, y = cty)) +
  geom_point() +
  facet_grid(cyl ~ year)
4. How does facet_grid(cyl ~ year) differ from facet_grid(~ year + cyl)?
# Both variables on the right-hand side of the formula.
ggplot(data = mpg, mapping = aes(x = displ, y = cty)) +
  geom_point() +
  facet_grid(~ year + cyl)
5. What do the scales and space arguments do?
# Same grid, first with scales = "free" only ...
ggplot(data = mpg, mapping = aes(x = displ, y = cty)) +
  geom_point() +
  facet_grid(year ~ cyl, scales = "free")

# ... then with space = "free" added.
ggplot(data = mpg, mapping = aes(x = displ, y = cty)) +
  geom_point() +
  facet_grid(year ~ cyl, scales = "free", space = "free")
OVERPLOTTING Layering helps display patterns
# Points with a smoother layered on top.
ggplot(mpg, aes(displ, hwy)) +
  geom_point() +
  geom_smooth()
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

# color = drv is set in ggplot(), so both layers inherit it.
ggplot(mpg, aes(displ, hwy, color = drv)) +
  geom_point() +
  geom_smooth()
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

# Per-layer mappings: shape on the points, colour on the smoothers.
ggplot(mpg, aes(displ, hwy)) +
  geom_point(aes(shape = drv)) +
  geom_smooth(aes(color = drv))
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
What is driving this upward swing (see the right side of the layering)?
# Colour is mapped on the point layer only; geom_smooth() has no colour mapping.
ggplot(mpg, aes(displ, hwy)) +
  geom_point(aes(color = class)) +
  geom_smooth()
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
Is it the red 2seaters between 6 & 7, above the curve? Let’s layer further!
# Flag the 2seater points by colour and fit the two groups separately, so the
# effect of the 2seaters on the overall trend can be seen.
mpg %>%
  ggplot(aes(x = displ, y = hwy)) +
  geom_point(aes(color = class == "2seater")) +
  # The 2seater subset is too small for loess: the default fit warned
  # "span too small. fewer data values than degrees of freedom" and reported
  # near singularities. A straight-line fit (method = "lm") avoids that.
  geom_smooth(
    data = filter(mpg, class == "2seater"),
    method = "lm",
    se = FALSE
  ) +
  # All other classes keep the default smoother.
  geom_smooth(data = filter(mpg, class != "2seater"), se = FALSE)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
The ‘se’ argument controls whether the confidence interval around the curve is drawn; se = FALSE hides it.
Another overplotting example:
# Box plots per class with the jittered observations drawn on top.
ggplot(mpg, aes(class, hwy)) +
  geom_boxplot() +
  geom_jitter(width = 0.2, alpha = 0.5)

# Smoother only, no points.
ggplot(mpg, aes(displ, cty)) +
  geom_smooth()
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
A rug plot is a compact visualisation designed to supplement a 2d display with the two 1d marginal distributions. Rug plots display individual cases so are best used with smaller datasets.
# Same smoother, with a rug layer added.
ggplot(mpg, aes(displ, cty)) +
  geom_smooth() +
  geom_rug()
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
POSITIONING 1) BAR CHARTS
All geoms have a position argument, but you will rarely need to adjust it. geom_bar and a few others benefit from its use, though. - position = “stack” (default) - position = “fill” - position = “dodge”
# The same bar chart drawn with each of the three position settings.
ggplot(data = mpg, mapping = aes(x = class, fill = factor(cyl))) +
  geom_bar(position = "fill")

ggplot(data = mpg, mapping = aes(x = class, fill = factor(cyl))) +
  geom_bar(position = "stack")

ggplot(data = mpg, mapping = aes(x = class, fill = factor(cyl))) +
  geom_bar(position = "dodge")
COORDINATE SYSTEM Manipulating your coordinates There are many options to manipulate and adjust the coordinate system, but the most basic ones include: 1) Flipping the coordinates
# Box plots per class in the default orientation ...
ggplot(mpg, aes(class, hwy)) +
  geom_boxplot()

# ... and the same plot with coord_flip() applied.
ggplot(mpg, aes(class, hwy)) +
  geom_boxplot() +
  coord_flip()
There are many options to manipulate and adjust the coordinate system but some basic ones include: •zooming in or out
# Restrict the visible window to displ in [4, 7] and cty in [10, 20].
ggplot(mpg, aes(displ, cty)) +
  geom_jitter() +
  coord_cartesian(xlim = c(4, 7), ylim = c(10, 20))
•formatting axes and labels
# median against volume for the txhousing data, with a log10 x axis and
# formatted axis labels.
ggplot(data = txhousing, aes(x = volume, y = median)) +
  # na.rm = TRUE drops the rows with missing values silently instead of
  # warning "Removed 617 rows containing missing values (geom_point)".
  geom_point(alpha = 0.25, na.rm = TRUE) +
  scale_y_continuous(name = "Median Sale Price", labels = scales::dollar) +
  scale_x_log10(name = "Total Sales Volume", labels = scales::comma) +
  # NOTE(review): the subtitle previously said 2000-2010; the bundled
  # txhousing data appears to run through 2015 -- confirm with
  # range(txhousing$year).
  ggtitle(
    "Texas Housing Sales",
    subtitle = "Sales data from 2000-2015 provided by the TAMU real estate center"
  )
• Creating pie-style charts with polar coordinates
# Stacked bars of class, filled by year, drawn in polar coordinates.
ggplot(mpg, aes(class, fill = factor(year))) +
  geom_bar() +
  coord_polar()

# Bars with position = "fill", y axis labelled as percentages, then flipped.
ggplot(mpg, aes(class, fill = factor(cyl))) +
  geom_bar(position = "fill") +
  scale_y_continuous(name = "Percent", labels = scales::percent) +
  coord_flip()
https://rpubs.com/bradleyboehmke/weather_graphic
# install.packages("ggplot2movies")
library(ggplot2movies)
library(tidyverse)
# Print the movies tibble to inspect its columns and first rows.
print(movies)
## # A tibble: 58,788 x 24
## title year length budget rating votes r1 r2 r3 r4 r5 r6
## <chr> <int> <int> <int> <dbl> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 $ 1971 121 NA 6.4 348 4.5 4.5 4.5 4.5 14.5 24.5
## 2 $1000 a~ 1939 71 NA 6 20 0 14.5 4.5 24.5 14.5 14.5
## 3 $21 a D~ 1941 7 NA 8.2 5 0 0 0 0 0 24.5
## 4 $40,000 1996 70 NA 8.2 6 14.5 0 0 0 0 0
## 5 $50,000~ 1975 71 NA 3.4 17 24.5 4.5 0 14.5 14.5 4.5
## 6 $pent 2000 91 NA 4.3 45 4.5 4.5 4.5 14.5 14.5 14.5
## 7 $windle 2002 93 NA 5.3 200 4.5 0 4.5 4.5 24.5 24.5
## 8 '15' 2002 25 NA 6.7 24 4.5 4.5 4.5 4.5 4.5 14.5
## 9 '38 1987 97 NA 6.6 18 4.5 4.5 4.5 0 0 0
## 10 '49-'17 1917 61 NA 6 51 4.5 0 4.5 4.5 4.5 44.5
## # ... with 58,778 more rows, and 12 more variables: r7 <dbl>, r8 <dbl>,
## # r9 <dbl>, r10 <dbl>, mpaa <chr>, Action <int>, Animation <int>,
## # Comedy <int>, Drama <int>, Documentary <int>, Romance <int>, Short <int>
# Bar chart with length on the x axis.
ggplot(movies, aes(x = length)) +
  geom_bar()