Load library

library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.1     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(gapminder)

Saving graphs - there are several ways to save a visualization programmatically. The first is the png() function: it opens a PNG graphics device, and every plot drawn afterwards is written to that file until the device is closed with dev.off().

# Open a PNG graphics device for the first graph (LOESS smoother).
# `filename` is spelled out instead of relying on partial matching of `file`.
png(filename = "diamonds_loess.png")
# print() explicitly so the plot is also drawn when the code is run through
# source(), where top-level values are not auto-printed.
print(
  ggplot(data = gapminder,
         mapping = aes(x = gdpPercap,
                       y = lifeExp)) +
    geom_point(mapping = aes(color = continent)) +
    geom_smooth(method = "loess") +
    scale_x_log10()
)
# Close this device before another one is opened; otherwise it stays open
# and the file may never be finalised.
dev.off()

## `geom_smooth()` using formula = 'y ~ x'

# Open a PNG graphics device for the second graph (linear-model smoother).
# `filename` is spelled out instead of relying on partial matching of `file`.
png(filename = "diamonds_lm.png")
# print() explicitly so the plot is also drawn when the code is run through
# source(), where top-level values are not auto-printed.
print(
  ggplot(data = gapminder,
         mapping = aes(x = gdpPercap,
                       y = lifeExp)) +
    geom_point(mapping = aes(color = continent)) +
    geom_smooth(method = "lm") +
    scale_x_log10()
)

## `geom_smooth()` using formula = 'y ~ x'

# Close the device so the PNG file is written to disk
dev.off()

## png 
##   2

The second way to save a file is the ggsave() function: ggsave(filename = "filename.ext", plot = <plot object>, width = 10, height = 8)

# Build the plot object first; ggsave() then writes it to disk without a
# graphics device having to be opened and closed by hand.
# NOTE: the object is called `plot`, which is also the name of base R's
# plot() function; the name is kept so existing references keep working.
plot <- ggplot(data = gapminder,
               mapping = aes(x = gdpPercap,
                             y = lifeExp)) +
  geom_point(mapping = aes(color = continent)) +
  geom_smooth(method = "loess") +
  scale_x_log10()


# Save the plot as a png. `filename` is the argument's full name; the
# previous `file =` only worked through partial argument matching.
ggsave(filename = "diamonds_ggs.png", plot = plot, width = 10, height = 5)

## `geom_smooth()` using formula = 'y ~ x'

Goal: Plot the trajectory of GDP per capita over time for each country

# Map color to country. The legend gets one entry per country, which makes
# the plot huge, so it is saved to a large PNG.
p <- ggplot(data = gapminder,
            mapping = aes(x = year,
                          y = gdpPercap)) +
  geom_point(mapping = aes(color = country))

# Save the plot as a png. `filename` is the argument's full name; the
# previous `file =` only worked through partial argument matching.
ggsave(filename = "YearVsgdp_country.png", plot = p, width = 20, height = 20)

Grouped Data - With the group aesthetic, a separate line can be drawn for each group. In this case, a separate line is created for each country.

# Points with the group aesthetic mapped to country.
# NOTE(review): this object is overwritten by the next assignment before it
# is printed or saved, so it never appears in the output.
p <- ggplot(data = gapminder,
            mapping = aes(x = year,
                          y = gdpPercap)) +
  geom_point(mapping = aes(group = country))

# Lines: `group = country` draws one line per country and
# `color = country` gives every line its own color.
p <- ggplot(data = gapminder,
            mapping = aes(x = year,
                          y = gdpPercap)) +
  geom_line(mapping = aes(group = country, color = country))


# Save the plot as a png. `filename` is the argument's full name; the
# previous `file =` only worked through partial argument matching.
ggsave(filename = "YearVsgdp_Groupedcountry.png", plot = p, width = 20, height = 20)

Faceting data - means making multiple small graphs to focus on subsets of data. To facet data, use the facet_wrap() function.

# One panel per continent; inside each panel one line is drawn per country.
facet <- ggplot(gapminder, aes(year, gdpPercap)) +
  geom_line(aes(group = country)) +
  facet_wrap(vars(continent))
facet

Set the column layout for facets by using the “ncol” property of facet_wrap. ncol determines how many subplots to generate on one row.

# GDP per capita over time, one panel per continent, all panels on one row.
facet <- ggplot(data = gapminder,
                mapping = aes(x = year,
                              y = gdpPercap)) +
  # one line per country
  geom_line(color = "#f857a6", mapping = aes(group = country)) +
  # one LOESS trend per panel; `linewidth` replaces `size`, which is
  # deprecated for lines since ggplot2 3.4.0
  geom_smooth(linewidth = 1.1, method = "loess", color = "lightblue", se = FALSE) +
  # log-scaled y axis with dollar-formatted labels
  scale_y_log10(labels = scales::dollar) +
  # ncol = 5 puts the continent panels side by side
  facet_wrap(~continent, ncol = 5) +
  labs(x = "Year",
       y = "GDP per capita",
       title = "GDP per Capita on Five Continents",
       subtitle = "Source: Gapminder")

## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

# Display the faceted plot
facet

## `geom_smooth()` using formula = 'y ~ x'

# Save the plot; no width/height is given, so ggsave() reports the size it
# picked. `filename` is spelled out instead of the partially matched `file`.
ggsave(filename = "yearVSGDP_facetNcol.png", plot = facet)

## Saving 7 x 5 in image
## `geom_smooth()` using formula = 'y ~ x'

# Same faceted plot with different colors and the x-axis labels rotated by
# 90 degrees so the year labels do not overlap in the narrow panels.
facet <- ggplot(data = gapminder,
                mapping = aes(x = year,
                              y = gdpPercap)) +
  # one line per country
  geom_line(color = "#f64f59", mapping = aes(group = country)) +
  # one LOESS trend per panel; `linewidth` replaces `size`, which is
  # deprecated for lines since ggplot2 3.4.0
  geom_smooth(linewidth = 1.1, method = "loess", color = "#12c2e9", se = FALSE) +
  scale_y_log10(labels = scales::dollar) +
  theme(axis.text.x = element_text(angle = 90)) +
  facet_wrap(~continent, ncol = 5) +
  labs(x = "Year",
       y = "GDP per capita",
       title = "GDP per Capita on Five Continents",
       subtitle = "Source: Gapminder")

# Display the faceted plot
facet

## `geom_smooth()` using formula = 'y ~ x'

# Save the plot; no width/height is given, so ggsave() reports the size it
# picked. `filename` is spelled out instead of the partially matched `file`.
ggsave(filename = "yearVSGDP_facetNcol_x90.png", plot = facet)

## Saving 7 x 5 in image
## `geom_smooth()` using formula = 'y ~ x'

Facet: facet_grid()

Facets can also be used in grid form to compare multiple axes of data. This removes the limitation of 1 row with many columns. To prepare the examples, switch datasets to gss_sm

# Take a peek at the new data set (gss_sm becomes available once socviz is loaded)
#devtools::install_github("kjhealy/socviz")
library("socviz")
dplyr::glimpse(gss_sm)

## Rows: 2,867
## Columns: 32
## $ year        <dbl> 2016, 2016, 2016, 2016, 2016, 2016, 2016, 2016, 2016, 2016…
## $ id          <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17,…
## $ ballot      <labelled> 1, 2, 3, 1, 3, 2, 1, 3, 1, 3, 2, 1, 2, 3, 2, 3, 3, 2,…
## $ age         <dbl> 47, 61, 72, 43, 55, 53, 50, 23, 45, 71, 33, 86, 32, 60, 76…
## $ childs      <dbl> 3, 0, 2, 4, 2, 2, 2, 3, 3, 4, 5, 4, 3, 5, 7, 2, 6, 5, 0, 2…
## $ sibs        <labelled> 2, 3, 3, 3, 2, 2, 2, 6, 5, 1, 4, 4, 3, 6, 0, 1, 3, 8,…
## $ degree      <fct> Bachelor, High School, Bachelor, High School, Graduate, Ju…
## $ race        <fct> White, White, White, White, White, White, White, Other, Bl…
## $ sex         <fct> Male, Male, Male, Female, Female, Female, Male, Female, Ma…
## $ region      <fct> New England, New England, New England, New England, New En…
## $ income16    <fct> $170000 or over, $50000 to 59999, $75000 to $89999, $17000…
## $ relig       <fct> None, None, Catholic, Catholic, None, None, None, Catholic…
## $ marital     <fct> Married, Never Married, Married, Married, Married, Married…
## $ padeg       <fct> Graduate, Lt High School, High School, NA, Bachelor, NA, H…
## $ madeg       <fct> High School, High School, Lt High School, High School, Hig…
## $ partyid     <fct> "Independent", "Ind,near Dem", "Not Str Republican", "Not …
## $ polviews    <fct> Moderate, Liberal, Conservative, Moderate, Slightly Libera…
## $ happy       <fct> Pretty Happy, Pretty Happy, Very Happy, Pretty Happy, Very…
## $ partners    <fct> NA, "1 Partner", "1 Partner", NA, "1 Partner", "1 Partner"…
## $ grass       <fct> NA, Legal, Not Legal, NA, Legal, Legal, NA, Not Legal, NA,…
## $ zodiac      <fct> Aquarius, Scorpio, Pisces, Cancer, Scorpio, Scorpio, Capri…
## $ pres12      <labelled> 3, 1, 2, 2, 1, 1, NA, NA, NA, 2, NA, NA, 1, 1, 2, 1, …
## $ wtssall     <dbl> 0.9569935, 0.4784968, 0.9569935, 1.9139870, 1.4354903, 0.9…
## $ income_rc   <fct> Gt $170000, Gt $50000, Gt $75000, Gt $170000, Gt $170000, …
## $ agegrp      <fct> Age 45-55, Age 55-65, Age 65+, Age 35-45, Age 45-55, Age 4…
## $ ageq        <fct> Age 34-49, Age 49-62, Age 62+, Age 34-49, Age 49-62, Age 4…
## $ siblings    <fct> 2, 3, 3, 3, 2, 2, 2, 6+, 5, 1, 4, 4, 3, 6+, 0, 1, 3, 6+, 2…
## $ kids        <fct> 3, 0, 2, 4+, 2, 2, 2, 3, 3, 4+, 4+, 4+, 3, 4+, 4+, 2, 4+, …
## $ religion    <fct> None, None, Catholic, Catholic, None, None, None, Catholic…
## $ bigregion   <fct> Northeast, Northeast, Northeast, Northeast, Northeast, Nor…
## $ partners_rc <fct> NA, 1, 1, NA, 1, 1, NA, 1, NA, 3, 1, NA, 1, NA, 0, 1, 0, N…
## $ obama       <dbl> 0, 1, 0, 0, 1, 1, NA, NA, NA, 0, NA, NA, 1, 1, 0, 1, 0, 1,…

To add a 2D facet to a plot, use facet_grid(). Goal: Plot the relationship between age and number of children, faceted by sex (rows) and race (columns)

# Age vs. number of children: semi-transparent points plus a smoother,
# laid out on a grid with sex on the rows and race on the columns.
a <- ggplot(gss_sm, aes(age, childs)) +
  geom_point(alpha = 0.2) +
  geom_smooth() +
  facet_grid(rows = vars(sex), cols = vars(race))
a

## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

## Warning: Removed 18 rows containing non-finite values (`stat_smooth()`).

## Warning: Removed 18 rows containing missing values (`geom_point()`).

Bar charts - data can be transformed via geoms to show a particular relationship or to clarify a point. We start with a simple bar graph of counts.

# geom_bar() counts the rows in each bigregion category and draws one bar
# per category (a bar chart of counts)
p <- ggplot(gss_sm, aes(x = bigregion)) +
  geom_bar()
p

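Instead of the default count computed by stat_count(), we can map y to “..prop..” to plot relative frequencies. “..prop..” is a temporary variable computed by the stat; the surrounding dots remove the chance of a conflict with other variables named prop or count. (Since ggplot2 3.4.0 the dot-dot notation is deprecated and the preferred spelling is after_stat(prop).)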

# Display proportions instead of counts for each big region.
# after_stat(prop) refers to the `prop` variable computed by the stat; it
# replaces the `..prop..` dot-dot notation deprecated in ggplot2 3.4.0.
p <- ggplot(data = gss_sm,
            mapping = aes(x = bigregion)) +
  geom_bar(mapping = aes(y = after_stat(prop)))
p

## Warning: The dot-dot notation (`..prop..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(prop)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

The plot above is not what we wanted: the proportion is computed separately within each bar, so every bar has a height of 1. We want proportions computed across all regions so that they sum to 1. To get that, use a dummy grouping of group = 1.

# group = 1 puts all rows into one group, so the proportions are computed
# across every region and sum to 1.
# after_stat(prop) replaces the `..prop..` dot-dot notation deprecated in
# ggplot2 3.4.0.
p <- ggplot(data = gss_sm,
            mapping = aes(x = bigregion)) +
  geom_bar(mapping = aes(y = after_stat(prop), group = 1))
p

gss_sm also has a survey on religious preferences (religion). To take a look at the counts of each answer, the table (data$column) function can be used

table(gss_sm[["religion"]])

## 
## Protestant   Catholic     Jewish       None      Other 
##       1371        649         51        619        159

To convert the table to a bar chart, make religion the x, omit the y.

# Color coded outline: `color` only affects the border of each bar
p <- ggplot(data = gss_sm,
            mapping = aes(x = religion, color = religion)) +
  geom_bar()
p

# Filled in colored bars: `fill` (not `color`) colors the inside of the bars.
# The x axis already labels every bar, so the redundant fill legend is
# dropped; guides() expects "none" here (FALSE is deprecated).
p <- ggplot(data = gss_sm,
            mapping = aes(x = religion, fill = religion)) +
  geom_bar() + guides(fill = "none")
p

## Warning: The `<scale>` argument of `guides()` cannot be `FALSE`. Use "none" instead as
## of ggplot2 3.3.4.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

Do the same for polviews instead of religion. We use the theme() function to make the x axis more readable.

# Color coded outline: `color` only affects the border of each bar
p <- ggplot(data = gss_sm,
            mapping = aes(x = polviews, color = polviews)) +
  geom_bar()
p

# Filled in colored bars: `fill` (not `color`) colors the inside of the bars.
# The fill legend is dropped with guides(fill = "none") (FALSE is
# deprecated) and the x-axis labels are rotated by 90 degrees for legibility.
p <- ggplot(data = gss_sm,
            mapping = aes(x = polviews, fill = polviews)) +
  geom_bar() + guides(fill = "none") +
  theme(axis.text.x = element_text(angle = 90))
p

Frequency plots display how often something happens

# Show color coded religion frequencies by region (stacked counts)
p <- ggplot(data = gss_sm,
            mapping = aes(x = bigregion,
                          fill = religion)) +
  geom_bar()
p

# position = "fill": every bar is stretched to the same height, so the
# segments show the share of each religion within a region
p <- ggplot(data = gss_sm,
            mapping = aes(x = bigregion,
                          fill = religion)) +
  geom_bar(position = "fill")
p

# position = "dodge": the religions are placed side by side within a region
p <- ggplot(data = gss_sm,
            mapping = aes(x = bigregion,
                          fill = religion)) +
  geom_bar(position = "dodge")
p

# Dodged bars with proportions on the y axis. Without an explicit group the
# proportion is computed within each bar, so all bars come out at height 1.
# after_stat(prop) replaces the `..prop..` notation deprecated in ggplot2 3.4.0.
p <- ggplot(data = gss_sm,
            mapping = aes(x = bigregion,
                          fill = religion)) +
  geom_bar(position = "dodge",
           mapping = aes(y = after_stat(prop)))
p

# Grouping by religion: proportions are now computed within each religion
# across the regions.
p <- ggplot(data = gss_sm,
            mapping = aes(x = bigregion,
                          fill = religion)) +
  geom_bar(position = "dodge",
           mapping = aes(y = after_stat(prop),
                         group = religion))
p

Histograms - Summarizes a continuous variable by chopping it up into segments(bins) and counting how many observations in each bin. We must determine how finely to bin the data. Use the geom_histogram().

First, we switch to the midwest data set

head(midwest, n = 6L)

## # A tibble: 6 × 28
##     PID county   state  area poptotal popdensity popwhite popblack popamerindian
##   <int> <chr>    <chr> <dbl>    <int>      <dbl>    <int>    <int>         <int>
## 1   561 ADAMS    IL    0.052    66090      1271.    63917     1702            98
## 2   562 ALEXAND… IL    0.014    10626       759      7054     3496            19
## 3   563 BOND     IL    0.022    14991       681.    14477      429            35
## 4   564 BOONE    IL    0.017    30806      1812.    29344      127            46
## 5   565 BROWN    IL    0.018     5836       324.     5264      547            14
## 6   566 BUREAU   IL    0.05     35688       714.    35157       50            65
## # ℹ 19 more variables: popasian <int>, popother <int>, percwhite <dbl>,
## #   percblack <dbl>, percamerindan <dbl>, percasian <dbl>, percother <dbl>,
## #   popadults <int>, perchsd <dbl>, percollege <dbl>, percprof <dbl>,
## #   poppovertyknown <int>, percpovertyknown <dbl>, percbelowpoverty <dbl>,
## #   percchildbelowpovert <dbl>, percadultpoverty <dbl>,
## #   percelderlypoverty <dbl>, inmetro <int>, category <chr>

Make a histogram that shows the distribution of county sizes

# No bin settings given: geom_histogram() chooses the binning itself
p <- ggplot(midwest, aes(x = area)) +
  geom_histogram()
p

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Coarser view: ask for 10 bins
p <- ggplot(midwest, aes(x = area)) +
  geom_histogram(bins = 10)
p

Histograms with multiple distributions - several distributions can be displayed in one plot. Here, we take a subset of the midwest table using the %in% operator.

# State abbreviations to keep
oh_wi <- c("OH", "WI")

# Keep all columns, but only the midwest rows whose state is either OH or WI
midwest_ohwi <- dplyr::filter(midwest, state %in% oh_wi)

head(midwest_ohwi, n = 6L)

## # A tibble: 6 × 28
##     PID county   state  area poptotal popdensity popwhite popblack popamerindian
##   <int> <chr>    <chr> <dbl>    <int>      <dbl>    <int>    <int>         <int>
## 1  2009 ADAMS    OH    0.035    25371       725.    25212       47            67
## 2  2010 ALLEN    OH    0.024   109755      4573.    96177    12313           202
## 3  2011 ASHLAND  OH    0.025    47507      1900.    46686      460            49
## 4  2012 ASHTABU… OH    0.041    99821      2435.    95465     3138           196
## 5  2013 ATHENS   OH    0.03     59549      1985.    56163     1678           167
## 6  2014 AUGLAIZE OH    0.024    44585      1858.    44225       66            50
## # ℹ 19 more variables: popasian <int>, popother <int>, percwhite <dbl>,
## #   percblack <dbl>, percamerindan <dbl>, percasian <dbl>, percother <dbl>,
## #   popadults <int>, perchsd <dbl>, percollege <dbl>, percprof <dbl>,
## #   poppovertyknown <int>, percpovertyknown <dbl>, percbelowpoverty <dbl>,
## #   percchildbelowpovert <dbl>, percadultpoverty <dbl>,
## #   percelderlypoverty <dbl>, inmetro <int>, category <chr>

Display the histogram of only Ohio and Wisconsin county area distribution

# Histogram of county area for the OH/WI subset, filled by state;
# alpha = 0.4 makes the bars semi-transparent
p <- ggplot(midwest_ohwi, aes(area, fill = state)) +
  geom_histogram(alpha = 0.4, bins = 20)
p

Density Charts - use a continuous variable to calculate the kernel density estimate of the underlying distribution. It is an alternative to histogram’s binning.

Use the geom_density() function to create a smoother distribution mapping.

# Density curves of county area for the OH/WI subset, filled by state
p <- ggplot(midwest_ohwi, aes(area, fill = state)) +
  geom_density(alpha = 0.4)
p

Avoid Unnecessary Transformations

ggplot() automatically does data transformation work for you. Sometimes you want to disable it because the data is already transformed. In the geom_*() call, set stat = “identity” to disable the automatic transformation.

# The titanic dataset is already summarised: one row per fate/sex
# combination, with a count (n) and a percent column
print(titanic)

##       fate    sex    n percent
## 1 perished   male 1364    62.0
## 2 perished female  126     5.7
## 3 survived   male  367    16.7
## 4 survived female  344    15.6

To graph the distribution without transformation, use stat = “identity” in the geom_

# Side-by-side (dodged) bar chart of the pre-computed percentages;
# stat = "identity" plots the percent values as they are
p <- ggplot(titanic, aes(fate, percent, fill = sex)) +
  geom_bar(stat = "identity", position = "dodge") +
  theme(legend.position = "top")
p

GGPlot Part 2

2023-04-04