5.1 Video

Libraries used

library(ggplot2)

Visualizing data

# Read the CEO data set (columns AGE and SALARY) from the working directory.
ceo <- readr::read_csv("ceo_salaries.csv")
# attach() puts the columns on the search path so later calls can use the bare
# names AGE and SALARY. The attached copy is a snapshot: it keeps the original
# upper-case names even after `ceo` itself is renamed further down.
attach(ceo)

Histograms

# Salary distribution with a custom title and x-axis label.
hist(
  ceo$SALARY,
  main = "Histogram of CEO salaries",
  xlab = "Salary bins in $USD"
)

# Age distribution using hist()'s default title and labels.
hist(ceo$AGE)

Determining number of bins and breaks

Specify number of breaks

R treats the requested number of breaks as a suggestion only.
It picks "pretty" break points near that number, so the x-axis falls on round, evenly spaced intervals.

hist(x=cars$speed, breaks=10) # notice that there are 11 bars

You can also supply the break points themselves. In the following example, 4:25 places a break at every integer from 4 through 25.

hist(x=cars$speed, breaks=4:25, main="Car Speed", xlab="Speed (mph)")

By vector

seq() used in the same way in vector form

hist(cars$speed, breaks=seq(4,25, by=1),main="Car Speed", xlab="Speed (mph)")

Labeling bars

# labels = TRUE prints each bar's count on top of the bar.
hist(
  x = cars$speed,
  breaks = 10,
  labels = TRUE,
  main = "Car Speed",
  xlab = "Speed (mph)"
)

Density plot

density(x=cars$speed) 
## 
## Call:
##  density.default(x = cars$speed)
## 
## Data: cars$speed (50 obs.);  Bandwidth 'bw' = 2.15
## 
##        x                y            
##  Min.   :-2.450   Min.   :7.999e-05  
##  1st Qu.: 6.025   1st Qu.:5.918e-03  
##  Median :14.500   Median :2.570e-02  
##  Mean   :14.500   Mean   :2.944e-02  
##  3rd Qu.:22.975   3rd Qu.:5.442e-02  
##  Max.   :31.450   Max.   :6.575e-02
cardensity <- density(cars$speed) 
plot(cardensity, xlab="Speed (mph)")

Add a curve

# Draw the histogram on the density scale so the kernel density curve can be
# overlaid on the same y-axis. `freq = FALSE` is the documented argument; the
# earlier `prob = TRUE` only worked through partial matching of `probability`.
hist(
  x = cars$speed,
  freq = FALSE,
  main = "Car speed",
  col = "purple",
  border = "white",
  xlab = "Speed (mph)"
)
# Add the density estimate on top of the bars.
lines(density(cars$speed), lwd = 3, col = "black")

Box (and whisker) plot

Same as scatter plot, but as boxplot(). Measures:

  • Range
  • Median
  • Interquartile range

As well as spread and skewness.

horizontal= takes TRUE or FALSE

# Box plot of CEO salaries, drawn without the surrounding frame.
boxplot(
  ceo$SALARY,
  main = "Boxplot showing the median and range of CEO salaries",
  frame.plot = FALSE # remove box around graph
)

Scatter plots

main="" chart title
type= type of chart

  • p points, i.e. a scatter plot (the default, so it does not need to be included explicitly as a parameter),
  • l lines,
  • b both lines and points,
  • h histogram/high-density
  • s stair steps,
  • n no plotting.

xlab= Label for x-axis
ylab= Label for y-axis
# Scatter plot of salary against age. The columns are read from `ceo` directly
# rather than through attach(), so the call does not depend on the search path.
plot(
  ceo$AGE, ceo$SALARY,
  main = "Does salary increase with age?",
  xlab = "Age",
  ylab = "Salary in thousands of $USD",
  frame.plot = FALSE
)

ggplot

Density

ggplot(ceo, aes(x=SALARY)) + geom_density(fill="#4cbe3a")
## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_density()`).

Histogram

ggplot(ceo, aes(x=SALARY))+geom_histogram(binwidth=100, fill = "#4cbea3")
## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_bin()`).

Scatter plot

Note: geom_point() will require x AND y in the aes()

# Both aesthetics are named so the x/y mapping is explicit.
ggplot(ceo, aes(x = SALARY, y = AGE)) +
  geom_point(color = "blue")
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_point()`).

Bar graph

ggplot(ceo, aes(AGE)) + geom_bar(fill="blue")

Data formatting

Changing column names

head(ceo,5)
## # A tibble: 5 × 2
##     AGE SALARY
##   <dbl>  <dbl>
## 1    53    145
## 2    43    621
## 3    33    262
## 4    45    208
## 5    46    362
# Lower-case every column name in place. Deriving the new names from the
# existing ones keeps each name attached to its own column, whatever the
# column order or count.
names(ceo) <- tolower(names(ceo))
head(ceo,5)
## # A tibble: 5 × 2
##     age salary
##   <dbl>  <dbl>
## 1    53    145
## 2    43    621
## 3    33    262
## 4    45    208
## 5    46    362

Reorder columns

head(ceo,5)
## # A tibble: 5 × 2
##     age salary
##   <dbl>  <dbl>
## 1    53    145
## 2    43    621
## 3    33    262
## 4    45    208
## 5    46    362
ceo <- ceo[c("salary","age")]

Notice that the subsetting brackets [ ] are used here instead of names(). This makes sense, since we’re telling R to build a table from those columns, in that order.

head(ceo,5)
## # A tibble: 5 × 2
##   salary   age
##    <dbl> <dbl>
## 1    145    53
## 2    621    43
## 3    262    33
## 4    208    45
## 5    362    46

Tables

table() makes a frequency table

# Frequency table of CEO ages, read from `ceo` itself (whose column is now
# `age`) rather than from the copy that attach() left on the search path.
# Naming the argument keeps the "AGE" header in the printed table.
table(AGE = ceo$age)
## AGE
## 32 33 36 37 38 40 41 43 44 45 46 47 48 49 50 51 52 53 55 56 57 58 59 60 61 62 
##  1  1  1  1  1  1  1  2  2  4  2  3  4  1  6  2  1  3  3  4  2  2  1  1  3  2 
## 63 69 70 74 
##  1  2  1  1

Showing two plots at one time (here: two histograms, side by side)

chartcolor <- "blue"

hist_salary <- ggplot(ceo, aes(salary)) + 
  geom_histogram(binwidth = 100, fill=chartcolor) +
  labs(title = "Histogram of CEO salaries",
       caption = "Kristen Sosulski | Source: Statcrunch (2019)",
       x = "Salary in thousands of $USD", y = "Frequency")

# Histogram of CEO ages. The ages span only a few decades, so the binwidth of
# 100 used for salaries would collapse every observation into a single bar;
# 5-year bins show the actual shape of the distribution.
hist_age <- ggplot(ceo, aes(age)) +
  geom_histogram(binwidth = 5, fill = chartcolor) +
  labs(
    title = "Histogram of CEO ages",
    caption = "Kristen Sosulski | Source: Statcrunch (2019)",
    x = "Age (in years)", y = "Frequency"
  )

Introducing cowplot!

cowplot::plot_grid(hist_age, hist_salary, labels = "AUTO")
## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_bin()`).

Another example

library(car) # the `car` package -- do not confuse with the built-in `cars` dataset
## Loading required package: carData
# Speed against stopping distance; smooth = FALSE turns off the smoother.
scatterplot(
  x = cars$speed,
  y = cars$dist,
  xlab = "Speed (mph)",
  ylab = "Stopping distance(ft)",
  main = "Cars: Speed and stopping distance",
  smooth = FALSE
)

Formatting numbers

hist_salary <- hist_salary + scale_x_continuous(labels=scales::comma) #adds comma to thousands
cowplot::plot_grid(hist_age, hist_salary, labels = "AUTO")
## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_bin()`).

But there are some values still cut off. Solution: make the graph wider

# Re-specify the x scale, this time with explicit limits up to 1500. A plot
# holds one x scale, so this replaces the scale added earlier (hence the
# message printed below).
# NOTE(review): `salary` appears to contain a missing value (see the earlier
# "Removed 1 row" warnings). If so, min() returns NA here and ggplot2 falls
# back to its default lower limit -- confirm that this is intended.
hist_salary <- hist_salary + scale_x_continuous(labels=scales::comma,
                                                limits=c(min(ceo$salary), 1500))
## Scale for x is already present.
## Adding another scale for x, which will replace the existing scale.

5.2 Follow along

Ordering factors

Preparing data:

undergrad <- readr::read_csv("undergrad.csv")
undergrad <- data.frame(undergrad)
#renaming columns in the undergrad data frame
names(undergrad) <- c("timestamp","excel","access", "statistics", "programming", "iscourse", "cscourse", "topics", "istopics", "onlinecourse", "concentration")
attach(undergrad)
hist(access) 
## Error in hist.default(access): 'x' must be numeric
#reassign access as a factor variable
access <-as.factor(access) 
# cast access as a numeric for plotting. 
# as.numeric() on a factor yields its integer level codes, which hist() can bin.
# NOTE(review): with the default breaks (1, 2, ..., 6) the first bin covers
# codes 1 and 2, so the first two response categories share a single bar.
figure06 <- hist(
  as.numeric(access),
  main = "Responses to the level of importance of learning Microsoft Access",
  xlab = "Bins by response category",
  col = "#4cbea3",
  labels = TRUE,
  border = "#FFFFFF"
)

Calling the figure06 will output the details of the graph

figure06
## $breaks
## [1] 1 2 3 4 5 6
## 
## $counts
## [1] 10  9 10  2  8
## 
## $density
## [1] 0.25641026 0.23076923 0.25641026 0.05128205 0.20512821
## 
## $mids
## [1] 1.5 2.5 3.5 4.5 5.5
## 
## $xname
## [1] "as.numeric(access)"
## 
## $equidist
## [1] TRUE
## 
## attr(,"class")
## [1] "histogram"
access_ordered <- ordered(x=access, levels=c("Strongly disagree","Disagree","Somewhat disagree","Neither agree or disagree","Somewhat agree","Agree","Strongly Agree"))

attributes(access_ordered)
## $levels
## [1] "Strongly disagree"         "Disagree"                 
## [3] "Somewhat disagree"         "Neither agree or disagree"
## [5] "Somewhat agree"            "Agree"                    
## [7] "Strongly Agree"           
## 
## $class
## [1] "ordered" "factor"
table(access_ordered)
## access_ordered
##         Strongly disagree                  Disagree         Somewhat disagree 
##                         0                         5                         2 
## Neither agree or disagree            Somewhat agree                     Agree 
##                         9                        10                         5 
##            Strongly Agree 
##                         8
# Same histogram, now on the level codes of the ordered factor.
# NOTE(review): the resulting breaks are 2, 3, ..., 7, so the first bin covers
# codes 2 and 3 and those two response categories share a single bar.
figure07 <- hist(
  as.numeric(access_ordered),
  breaks = 7,
  main = "Responses to the level of importance of learning Microsoft Access",
  xlab = "Bins by response category",
  col = "#4cbea3",
  labels = TRUE,
  border = "#FFFFFF"
)

figure07
## $breaks
## [1] 2 3 4 5 6 7
## 
## $counts
## [1]  7  9 10  5  8
## 
## $density
## [1] 0.1794872 0.2307692 0.2564103 0.1282051 0.2051282
## 
## $mids
## [1] 2.5 3.5 4.5 5.5 6.5
## 
## $xname
## [1] "as.numeric(access_ordered)"
## 
## $equidist
## [1] TRUE
## 
## attr(,"class")
## [1] "histogram"
# Draw both histograms side by side. par() returns the previous settings, so
# the 1 x 2 layout can be undone afterwards instead of leaking into later
# base-graphics plots.
old_par <- par(mfrow = c(1, 2))

# Left panel: level codes of the unordered factor.
hist(
  as.numeric(access),
  main = "Figure 6",
  xlab = "Bins by response category",
  col = "#4cbea3",
  labels = TRUE,
  border = "#FFFFFF"
)

# Right panel: level codes of the ordered factor.
hist(
  as.numeric(access_ordered),
  breaks = 7,
  main = "Figure 7",
  xlab = "Bins by response category",
  col = "#4cbea3",
  labels = TRUE,
  border = "#FFFFFF"
)

# Restore the plotting layout that was active before this chunk.
par(old_par)

detach(undergrad)

Data visualization

Data visualization ggplot2 cheat sheet

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ lubridate 1.9.3     ✔ tibble    3.2.1
## ✔ purrr     1.0.2     ✔ tidyr     1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ✖ dplyr::recode() masks car::recode()
## ✖ purrr::some()   masks car::some()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)
penguins <- palmerpenguins::penguins

penguins
## # A tibble: 344 × 8
##    species island    bill_length_mm bill_depth_mm flipper_length_mm body_mass_g
##    <fct>   <fct>              <dbl>         <dbl>             <int>       <int>
##  1 Adelie  Torgersen           39.1          18.7               181        3750
##  2 Adelie  Torgersen           39.5          17.4               186        3800
##  3 Adelie  Torgersen           40.3          18                 195        3250
##  4 Adelie  Torgersen           NA            NA                  NA          NA
##  5 Adelie  Torgersen           36.7          19.3               193        3450
##  6 Adelie  Torgersen           39.3          20.6               190        3650
##  7 Adelie  Torgersen           38.9          17.8               181        3625
##  8 Adelie  Torgersen           39.2          19.6               195        4675
##  9 Adelie  Torgersen           34.1          18.1               193        3475
## 10 Adelie  Torgersen           42            20.2               190        4250
## # ℹ 334 more rows
## # ℹ 2 more variables: sex <fct>, year <int>
#View(penguins)
dplyr::glimpse(penguins)
## Rows: 344
## Columns: 8
## $ species           <fct> Adelie, Adelie, Adelie, Adelie, Adelie, Adelie, Adel…
## $ island            <fct> Torgersen, Torgersen, Torgersen, Torgersen, Torgerse…
## $ bill_length_mm    <dbl> 39.1, 39.5, 40.3, NA, 36.7, 39.3, 38.9, 39.2, 34.1, …
## $ bill_depth_mm     <dbl> 18.7, 17.4, 18.0, NA, 19.3, 20.6, 17.8, 19.6, 18.1, …
## $ flipper_length_mm <int> 181, 186, 195, NA, 193, 190, 181, 195, 193, 190, 186…
## $ body_mass_g       <int> 3750, 3800, 3250, NA, 3450, 3650, 3625, 4675, 3475, …
## $ sex               <fct> male, female, female, NA, female, male, female, male…
## $ year              <int> 2007, 2007, 2007, 2007, 2007, 2007, 2007, 2007, 2007…

Creating a ggplot

1) Create ggplot object

library(ggplot2)
ggplot(data = penguins)

2) Mapping visual properties (aesthetics)

ggplot(
  data = penguins,
  mapping = aes(x = flipper_length_mm, y = body_mass_g)
)

3) Add the points/graphs

geom_bar() bar charts
geom_line() line chart
geom_boxplot() boxplot
geom_point() scatterplot

ggplot(
  data = penguins,
  mapping = aes(x = flipper_length_mm, y = body_mass_g)
) +
  geom_point() # the graph
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_point()`).

Notice the warning: “there are two penguins in our dataset with missing body mass and/or flipper length values and ggplot2 has no way of representing them on the plot without both of these values”. R calls this out so that we know some rows were dropped.

Aesthetics and Layers

aes()

ggplot(
  data = penguins,
  mapping = aes(x = flipper_length_mm, y = body_mass_g, color = species)
) +
  geom_point() # the graph
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_point()`).

Scaling

When a categorical variable is mapped to an aesthetic, ggplot2 will automatically assign a unique value of the aesthetic (here a unique color) to each unique level of the variable (each of the three species)
Translated: Color coding happens on its own!

Adding another layer:

  • smooth curve geom_smooth
  • line of best fit based on linear model method="lm"
ggplot( 
  data = penguins,
  mapping = aes(x = flipper_length_mm, y = body_mass_g, color = species)
) + 
  geom_point() + # the graph
  geom_smooth(method = "lm") # the other layer
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 2 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_point()`).

Local vs global mapping

ggplot( # global level
  data = penguins,
  mapping = aes(x = flipper_length_mm, y = body_mass_g)
) + # everything after this is local
  geom_point(mapping = aes(color = species)) +
  geom_smooth(method = "lm")
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 2 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_point()`).

Take into consideration color blindness, so also use shapes

ggplot(
  data = penguins,
  mapping = aes(x = flipper_length_mm, y = body_mass_g)
) +
  geom_point(mapping = aes(color = species, shape = species)) + # here
  geom_smooth(method = "lm")
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 2 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_point()`).

Labels

library(ggthemes)
ggplot(
  data = penguins,
  mapping = aes(x = flipper_length_mm, y = body_mass_g)
) +
  geom_point(aes(color = species, shape = species)) +
  geom_smooth(method = "lm") +
  labs(
    title = "Body mass and flipper length",
    subtitle = "Dimensions for Adelie, Chinstrap, and Gentoo Penguins",
    x = "Flipper length (mm)", y = "Body mass (g)",
    color = "Species", shape = "Species"
  ) +
  ggthemes::scale_color_colorblind()
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 2 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_point()`).

Conciseness

ggplot(penguins, aes(x = flipper_length_mm, y = body_mass_g)) + 
  geom_point()
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_point()`).

and

penguins |> 
  ggplot(aes(x = flipper_length_mm, y = body_mass_g)) + 
  geom_point()
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_point()`).

Visualizing distributions

Categorical variable

can only take one of a small set of values
Example: bar chart

ggplot(penguins, aes(x = species)) +
  geom_bar()

In bar plots of categorical variables with non-ordered levels, like the penguin species above, it’s often preferable to reorder the bars based on their frequencies. Doing so requires transforming the variable to a factor (how R handles categorical data) and then reordering the levels of that factor.

ggplot(penguins, aes(x = fct_infreq(species))) +
  geom_bar()

Numerical variable

can take on a wide range of numerical values, and it is sensible to add, subtract, or take averages with those values. Numerical variables can be continuous or discrete
Example: histogram

ggplot(penguins, aes(x = body_mass_g)) +
  geom_histogram(binwidth = 200)
## Warning: Removed 2 rows containing non-finite outside the scale range
## (`stat_bin()`).

Additional bins

ggplot(penguins, aes(x = body_mass_g)) +
  geom_histogram(binwidth = 20)
## Warning: Removed 2 rows containing non-finite outside the scale range
## (`stat_bin()`).

ggplot(penguins, aes(x = body_mass_g)) +
  geom_histogram(binwidth = 2000)
## Warning: Removed 2 rows containing non-finite outside the scale range
## (`stat_bin()`).

Density plots

ggplot(penguins, aes(x = body_mass_g)) +
  geom_density()
## Warning: Removed 2 rows containing non-finite outside the scale range
## (`stat_density()`).

Visualizing relationships

Numerical and categorical

Box plots

  • A box that indicates the range of the middle half of the data, a distance known as the interquartile range (IQR), stretching from the 25th percentile of the distribution to the 75th percentile. In the middle of the box is a line that displays the median, i.e. 50th percentile, of the distribution. These three lines give you a sense of the spread of the distribution and whether or not the distribution is symmetric about the median or skewed to one side.
  • Visual points that display observations that fall more than 1.5 times the IQR from either edge of the box. These outlying points are unusual so are plotted individually.
  • A line (or whisker) that extends from each end of the box and goes to the farthest non-outlier point in the distribution.
Explanation of box plot
Explanation of box plot

Example: distribution of body mass

ggplot(penguins, aes(x = species, y = body_mass_g)) +
  geom_boxplot()
## Warning: Removed 2 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

Example: same but with density plot

ggplot(penguins, aes(x = body_mass_g, color = species)) +
  geom_density(linewidth = 0.75)
## Warning: Removed 2 rows containing non-finite outside the scale range
## (`stat_density()`).

Prettified

ggplot(penguins, aes(x = body_mass_g, color = species, fill = species)) +
  geom_density(alpha = 0.5)
## Warning: Removed 2 rows containing non-finite outside the scale range
## (`stat_density()`).

Two categorical

ggplot(penguins, aes(x = island, fill = species)) +
  geom_bar()

The same data shown a different way, with position = "fill".
This is more useful for comparing species distributions across islands, since it’s not affected by the unequal numbers of penguins across the islands.

ggplot(penguins, aes(x = island, fill = species)) +
  geom_bar(position = "fill")

Two numerical

scatterplot is probably the most commonly used plot for visualizing the relationship between two numerical variables

ggplot(penguins, aes(x = flipper_length_mm, y = body_mass_g)) +
  geom_point()
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_point()`).

Three or more variables

Example: the colors of points represent species and the shapes of points represent islands

ggplot(penguins, aes(x = flipper_length_mm, y = body_mass_g)) +
  geom_point(aes(color = species, shape = island))
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_point()`).

Facets

subplots that each display one subset of the data.

ggplot(penguins, aes(x = flipper_length_mm, y = body_mass_g)) +
  geom_point(aes(color = species, shape = species)) +
  facet_wrap(~island)
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_point()`).

Saving plots

Example: ggsave(filename = "penguin-plot.png")

Bring points forward

stackoverflow