#install.packages("tidyverse")

library(tidyverse)

## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --

## v ggplot2 3.3.5     v purrr   0.3.4
## v tibble  3.1.5     v dplyr   1.0.7
## v tidyr   1.1.4     v stringr 1.4.0
## v readr   2.0.2     v forcats 0.5.1

## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

# Show the first rows of the mpg tibble.
print(mpg)

## # A tibble: 234 x 11
##    manufacturer model      displ  year   cyl trans drv     cty   hwy fl    class
##    <chr>        <chr>      <dbl> <int> <int> <chr> <chr> <int> <int> <chr> <chr>
##  1 audi         a4           1.8  1999     4 auto~ f        18    29 p     comp~
##  2 audi         a4           1.8  1999     4 manu~ f        21    29 p     comp~
##  3 audi         a4           2    2008     4 manu~ f        20    31 p     comp~
##  4 audi         a4           2    2008     4 auto~ f        21    30 p     comp~
##  5 audi         a4           2.8  1999     6 auto~ f        16    26 p     comp~
##  6 audi         a4           2.8  1999     6 manu~ f        18    26 p     comp~
##  7 audi         a4           3.1  2008     6 auto~ f        18    27 p     comp~
##  8 audi         a4 quattro   1.8  1999     4 manu~ 4        18    26 p     comp~
##  9 audi         a4 quattro   1.8  1999     4 auto~ 4        16    25 p     comp~
## 10 audi         a4 quattro   2    2008     4 manu~ 4        20    28 p     comp~
## # ... with 224 more rows

# Open mpg in the data viewer (only useful in an interactive session).
view(mpg)

# With data but no mapping and no layer, only a blank canvas is drawn.
ggplot(mpg)

# aes() is the aesthetic mapping: it says which columns go on which axis.
# There is still no geom layer here, so only the axes appear.
ggplot(mpg, aes(x = displ, y = hwy))

# Distribution of hwy drawn three ways: histogram, frequency polygon,
# and density curve.
ggplot(mpg, aes(x = hwy)) +
  geom_histogram()

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(mpg, aes(x = hwy)) +
  geom_freqpoly()

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(mpg, aes(x = hwy)) +
  geom_density()

# Number of rows per class.
ggplot(mpg, aes(x = class)) +
  geom_bar()

# hwy against displ as a scatter plot.
ggplot(mpg, aes(x = displ, y = hwy)) +
  geom_point()

# Box plots of hwy across displ. displ is continuous, so a bare
# geom_boxplot() draws a single box and warns
# "Continuous x aesthetic -- did you forget aes(group=...)?".
# Binning displ with cut_width() gives one box per 0.5-wide interval.
ggplot(mpg, aes(x = displ, y = hwy)) +
  geom_boxplot(aes(group = cut_width(displ, 0.5)))

# Distribution of hwy within each class, shown as violins.
ggplot(mpg, aes(x = class, y = hwy)) +
  geom_violin()

# Histogram of cty.
ggplot(mpg, aes(x = cty)) +
  geom_histogram()

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Number of rows per manufacturer.
ggplot(mpg, aes(x = manufacturer)) +
  geom_bar()

# Scatter plots ----

ggplot(mpg, aes(x = displ, y = cty)) +
  geom_point()

# Fixed (non-data) aesthetics are given to the geom, outside aes().
ggplot(mpg, aes(x = displ, y = hwy)) +
  geom_point(color = "blue", size = 2, shape = 17, alpha = 0.5)

# Same styling, drawn with geom_jitter() instead of geom_point().
ggplot(mpg, aes(x = displ, y = hwy)) +
  geom_jitter(color = "blue", size = 2, shape = 17, alpha = 0.5)

ggplot(mpg, aes(x = displ, y = hwy)) +
  geom_point(color = "blue")

# Colour mapped to a column: points are coloured by class.
ggplot(mpg, aes(x = displ, y = hwy, color = class)) +
  geom_point()

# With "blue" inside aes() the string is treated as a data value rather
# than a colour name, so the dots come out red instead of blue.
ggplot(mpg, aes(x = displ, y = hwy, color = "blue")) +
  geom_point()

# Open the help page for the mpg dataset.
help("mpg")

## starting httpd help server ... done

# Print the data again for reference.
print(mpg)

## # A tibble: 234 x 11
##    manufacturer model      displ  year   cyl trans drv     cty   hwy fl    class
##    <chr>        <chr>      <dbl> <int> <int> <chr> <chr> <int> <int> <chr> <chr>
##  1 audi         a4           1.8  1999     4 auto~ f        18    29 p     comp~
##  2 audi         a4           1.8  1999     4 manu~ f        21    29 p     comp~
##  3 audi         a4           2    2008     4 manu~ f        20    31 p     comp~
##  4 audi         a4           2    2008     4 auto~ f        21    30 p     comp~
##  5 audi         a4           2.8  1999     6 auto~ f        16    26 p     comp~
##  6 audi         a4           2.8  1999     6 manu~ f        18    26 p     comp~
##  7 audi         a4           3.1  2008     6 auto~ f        18    27 p     comp~
##  8 audi         a4 quattro   1.8  1999     4 manu~ 4        18    26 p     comp~
##  9 audi         a4 quattro   1.8  1999     4 auto~ 4        16    25 p     comp~
## 10 audi         a4 quattro   2    2008     4 manu~ 4        20    28 p     comp~
## # ... with 224 more rows

Note: a continuous variable cannot be mapped to shape.

 ggplot(mpg, aes(displ, hwy, color = cty, size = displ)) +
   geom_point()

# drv drives both the colour and the shape of the points.
ggplot(mpg, aes(displ, cty, color = drv, shape = drv)) +
  geom_point()

# stroke: border width of the point (shape 21 has a border and a fill).
ggplot(mpg, aes(displ, cty)) +
  geom_point(
    shape = 21, color = "black", fill = "white", size = 5, stroke = 2
  )

# A logical expression can be mapped as well: colour by displ < 5.
ggplot(mpg, aes(displ, cty, color = displ < 5)) +
  geom_point()

# Facet functions provide a simple way to create small multiples.
#   facet_wrap() - small multiples based on a single variable
#   facet_grid() - grid of small multiples based on two variables
#   nrow / ncol  - dimensions of the panel layout

ggplot(mpg, aes(x = displ, y = hwy)) +
  geom_point() +
  facet_wrap(~class, nrow = 2)

help("facet_wrap")
help("facet_grid")

What do the empty cells in the plot with facet_grid(drv ~ cyl) mean? How do they relate to this plot? A facet is empty when there is no data for the corresponding combination, e.g. no rear-wheel-drive (r) car with 4 or 5 cylinders is listed. Facets for 7 cylinders are missing entirely because no car in the data has 7 cylinders.

# Grid of panels: drv on the rows, cyl on the columns.
ggplot(mpg, aes(x = displ, y = hwy)) +
  geom_point() +
  facet_grid(drv ~ cyl)

Create a scatter plot of displ vs cty facetted by year

# One panel per year.
ggplot(mpg, aes(displ, cty)) +
  geom_point() +
  facet_wrap(~year)

Create a scatter plot of displ vs cty facetted by year and cyl

# year on the rows, cyl on the columns.
ggplot(mpg, aes(displ, cty)) +
  geom_point() +
  facet_grid(year ~ cyl)

How does placement within facet_grid(cyl ~ year) affect the output?

# Swapped placement: cyl on the rows, year on the columns.
ggplot(mpg, aes(displ, cty)) +
  geom_point() +
  facet_grid(cyl ~ year)

4. How does facet_grid(cyl ~ year) differ from facet_grid(~ year + cyl)?

# Both variables on the column side of the formula: a single row of panels,
# one per year/cyl combination.
ggplot(mpg, aes(displ, cty)) +
  geom_point() +
  facet_grid(~ year + cyl)

5. What do the scales and space arguments do?

# scales = "free": the axis ranges may differ across rows and columns.
ggplot(mpg, aes(displ, cty)) +
  geom_point() +
  facet_grid(year ~ cyl, scales = "free")

# Adding space = "free": panel sizes follow the range of their scales.
ggplot(mpg, aes(displ, cty)) +
  geom_point() +
  facet_grid(year ~ cyl, scales = "free", space = "free")

OVERPLOTTING Layering helps display patterns

# Points with a smoothed trend line layered on top.
ggplot(mpg, aes(x = displ, y = hwy)) +
  geom_point() +
  geom_smooth()

## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

# color set in the top-level aes() applies to both layers, so points and
# smoothers are split by drv.
ggplot(mpg, aes(x = displ, y = hwy, color = drv)) +
  geom_point() +
  geom_smooth()

## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

# Per-layer mappings: drv sets the point shape and the smoother colour.
ggplot(mpg, aes(x = displ, y = hwy)) +
  geom_point(aes(shape = drv)) +
  geom_smooth(aes(color = drv))

## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

What is driving this upward swing (see the right-hand side of the layered plot)?

# Colour only the points by class; the smoother is fitted to all rows.
ggplot(mpg, aes(x = displ, y = hwy)) +
  geom_point(aes(color = class)) +
  geom_smooth()

## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

Is it the two-seaters, the red points between 6 and 7 that sit above the curve? Let’s layer further!

# Highlight the 2seater class and fit one trend line to it and another to
# all remaining cars. The default loess smoother is not usable on the small
# 2seater subset (it warned "span too small. fewer data values than degrees
# of freedom." plus several near-singularity warnings), so a straight-line
# fit is used for that subset. Method and formula are spelled out for both
# layers so neither smoother has to guess them.
ggplot(mpg, aes(x = displ, y = hwy)) +
  geom_point(aes(color = class == "2seater")) +
  geom_smooth(
    data = filter(mpg, class == "2seater"),
    method = "lm", formula = y ~ x, se = FALSE
  ) +
  geom_smooth(
    data = filter(mpg, class != "2seater"),
    method = "loess", formula = y ~ x, se = FALSE
  )

The ‘se’ argument controls whether a confidence interval is drawn around the curve; se = FALSE hides it.

Another Overplotting example :

# Box plots per class with the individual observations jittered on top.
ggplot(mpg, aes(x = class, y = hwy)) +
  geom_boxplot() +
  geom_jitter(width = 0.2, alpha = 0.5)

# The smoothed trend on its own, without the points.
ggplot(mpg, aes(x = displ, y = cty)) +
  geom_smooth()

## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

A rug plot is a compact visualisation designed to supplement a 2d display with the two 1d marginal distributions. Rug plots display individual cases so are best used with smaller datasets.

# Smoothed trend plus rug marks along the axes for the individual cases.
ggplot(mpg, aes(x = displ, y = cty)) +
  geom_smooth() +
  geom_rug()

## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

POSITIONING 1) BAR CHARTS

All geoms have a position argument, but you will rarely need to adjust it. geom_bar and a few others benefit from its use, though: - position = “stack” (default) - position = “fill” - position = “dodge”

# The same bar chart of class filled by cyl, under each position setting.

# "fill": bars stacked and scaled to equal height (proportions).
ggplot(mpg, aes(class, fill = factor(cyl))) +
  geom_bar(position = "fill")

# "stack": counts stacked on top of each other.
ggplot(mpg, aes(class, fill = factor(cyl))) +
  geom_bar(position = "stack")

# "dodge": bars placed side by side.
ggplot(mpg, aes(class, fill = factor(cyl))) +
  geom_bar(position = "dodge")

COORDINATE SYSTEM Manipulating your coordinates There are many options to manipulate and adjust the coordinate system, but the most basic ones include: 1) Flipping the coordinates

# Box plots of hwy per class in the default orientation.
ggplot(mpg, aes(x = class, y = hwy)) +
  geom_boxplot()

# The same plot with the x and y axes swapped.
ggplot(mpg, aes(x = class, y = hwy)) +
  geom_boxplot() +
  coord_flip()

There are many options to manipulate and adjust the coordinate system but some basic ones include: •zooming in or out

# Zoom the view to displ in [4, 7] and cty in [10, 20].
ggplot(mpg, aes(x = displ, y = cty)) +
  geom_jitter() +
  coord_cartesian(xlim = c(4, 7), ylim = c(10, 20))

•formatting axes and labels

# median against volume for the txhousing data. The x axis is log10-scaled;
# y labels are formatted as dollars and x labels with thousands separators.
# The txhousing dataset spans 2000-2015, so the subtitle states that range
# (it previously said 2000-2010).
ggplot(txhousing, aes(x = volume, y = median)) +
  geom_point(alpha = 0.25) +
  scale_y_continuous(name = "Median Sale Price", labels = scales::dollar) +
  scale_x_log10(name = "Total Sales Volume", labels = scales::comma) +
  ggtitle(
    "Texas Housing Sales",
    subtitle = "Sales data from 2000-2015 provided by the TAMU real estate center"
  )

## Warning: Removed 617 rows containing missing values (geom_point).

• Creating pie charts

# Stacked bar chart of class filled by year, drawn in polar coordinates.
ggplot(mpg, aes(class, fill = factor(year))) +
  geom_bar() +
  coord_polar()

# Share of each cyl value within class, as horizontal bars with a
# percent-formatted axis.
ggplot(mpg, aes(class, fill = factor(cyl))) +
  geom_bar(position = "fill") +
  scale_y_continuous(name = "Percent", labels = scales::percent) +
  coord_flip()

https://rpubs.com/bradleyboehmke/weather_graphic

# install.packages("ggplot2movies")

library(ggplot2movies)
library(tidyverse)

# Show the first rows of the movies tibble.
print(movies)

## # A tibble: 58,788 x 24
##    title     year length budget rating votes    r1    r2    r3    r4    r5    r6
##    <chr>    <int>  <int>  <int>  <dbl> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
##  1 $         1971    121     NA    6.4   348   4.5   4.5   4.5   4.5  14.5  24.5
##  2 $1000 a~  1939     71     NA    6      20   0    14.5   4.5  24.5  14.5  14.5
##  3 $21 a D~  1941      7     NA    8.2     5   0     0     0     0     0    24.5
##  4 $40,000   1996     70     NA    8.2     6  14.5   0     0     0     0     0  
##  5 $50,000~  1975     71     NA    3.4    17  24.5   4.5   0    14.5  14.5   4.5
##  6 $pent     2000     91     NA    4.3    45   4.5   4.5   4.5  14.5  14.5  14.5
##  7 $windle   2002     93     NA    5.3   200   4.5   0     4.5   4.5  24.5  24.5
##  8 '15'      2002     25     NA    6.7    24   4.5   4.5   4.5   4.5   4.5  14.5
##  9 '38       1987     97     NA    6.6    18   4.5   4.5   4.5   0     0     0  
## 10 '49-'17   1917     61     NA    6      51   4.5   0     4.5   4.5   4.5  44.5
## # ... with 58,778 more rows, and 12 more variables: r7 <dbl>, r8 <dbl>,
## #   r9 <dbl>, r10 <dbl>, mpaa <chr>, Action <int>, Animation <int>,
## #   Comedy <int>, Drama <int>, Documentary <int>, Romance <int>, Short <int>

# Number of movies at each value of length.
ggplot(movies, aes(length)) +
  geom_bar()

Week 5 Data Wrangling

Note: a continuous variable cannot be mapped to shape.