Libraries used
# Load ggplot2 for the ggplot() examples further down.
library(ggplot2)
# Read the CEO data set; the head() output later in this file shows two
# columns, AGE and SALARY.
ceo <- readr::read_csv("ceo_salaries.csv")
# NOTE(review): attach() copies the columns onto the search path, so later
# bare references (AGE, SALARY) do not follow subsequent changes to `ceo`.
attach(ceo)
# Base-graphics histogram of salaries with an explicit title and x-axis label.
hist(ceo$SALARY,main="Histogram of CEO salaries",xlab="Salary bins in $USD")
# Histogram of ages using hist()'s default title and axis labels.
hist(ceo$AGE)
R treats the number passed to breaks= as a suggestion only.
It picks the break points so that the x axis falls on round, logical intervals.
hist(x=cars$speed, breaks=10) # notice that there are 11 bars
You can also specify the break points explicitly; the following example places a possible break at every integer from 4 through 25
hist(x=cars$speed, breaks=4:25, main="Car Speed", xlab="Speed (mph)")
seq() can be used in the same way to build the vector of break points
hist(cars$speed, breaks=seq(4,25, by=1),main="Car Speed", xlab="Speed (mph)")
hist(x=cars$speed, breaks=10, labels=TRUE, main="Car Speed", xlab="Speed (mph)")
density(x=cars$speed)
##
## Call:
## density.default(x = cars$speed)
##
## Data: cars$speed (50 obs.); Bandwidth 'bw' = 2.15
##
## x y
## Min. :-2.450 Min. :7.999e-05
## 1st Qu.: 6.025 1st Qu.:5.918e-03
## Median :14.500 Median :2.570e-02
## Mean :14.500 Mean :2.944e-02
## 3rd Qu.:22.975 3rd Qu.:5.442e-02
## Max. :31.450 Max. :6.575e-02
cardensity <- density(cars$speed)
plot(cardensity, xlab="Speed (mph)")
Add a curve
# Draw the histogram on the density scale so that the density curve added
# below shares its y axis. `probability` is spelled out in full rather than
# relying on partial matching of `prob`.
hist(x = cars$speed, probability = TRUE, main = "Car speed", col = "purple",
     border = "white", xlab = "Speed (mph)")
# Overlay the kernel density estimate of the same data.
lines(density(cars$speed), lwd = 3, col = "black")
Measures: same as the scatter plot, but drawn with boxplot()
horizontal= takes TRUE or FALSE
boxplot(ceo$SALARY, main="Boxplot showing the median and range of CEO salaries", frame.plot=FALSE) # remove box around graph
main="" chart title
type= type of chart:
p scatter plots (default), l lines, b both lines and points, h histogram/high-density, s stair steps, n no plotting.
xlab= Label for x-axis
ylab= Label for y-axis
plot(AGE, SALARY, main="Does salary increase with age?", xlab="Age",ylab="Salary in thousands of $USD",frame.plot=FALSE)
# Density of CEO salaries. The fill uses the same teal (#4cbea3) as the other
# charts in this file; the original "#4cbe3a" looked like a transposed digit.
ggplot(ceo, aes(x = SALARY)) +
  geom_density(fill = "#4cbea3")
## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_density()`).
ggplot(ceo, aes(x=SALARY))+geom_histogram(binwidth=100, fill = "#4cbea3")
## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_bin()`).
Note: geom_point() requires both x AND y in the aes()
# Scatter plot with salary on the x axis and age on the y axis, blue points.
ggplot(data = ceo, mapping = aes(x = SALARY, y = AGE)) +
  geom_point(color = "blue")
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_point()`).
ggplot(ceo, aes(AGE)) + geom_bar(fill="blue")
head(ceo,5)
## # A tibble: 5 × 2
## AGE SALARY
## <dbl> <dbl>
## 1 53 145
## 2 43 621
## 3 33 262
## 4 45 208
## 5 46 362
# Lower-case whatever column names the data frame currently has, instead of
# retyping them in a fixed order (which would mislabel columns if the order
# ever differed).
names(ceo) <- tolower(names(ceo))
head(ceo,5)
## # A tibble: 5 × 2
## age salary
## <dbl> <dbl>
## 1 53 145
## 2 43 621
## 3 33 262
## 4 45 208
## 5 46 362
head(ceo,5)
## # A tibble: 5 × 2
## age salary
## <dbl> <dbl>
## 1 53 145
## 2 43 621
## 3 33 262
## 4 45 208
## 5 46 362
ceo <- ceo[c("salary","age")]
Notice that the access brackets are used instead. It makes sense to use them since we’re telling the system to create a table with those columns
head(ceo,5)
## # A tibble: 5 × 2
## salary age
## <dbl> <dbl>
## 1 145 53
## 2 621 43
## 3 262 33
## 4 208 45
## 5 362 46
table() makes a frequency table
# Tabulate ages from the data frame itself rather than from the bare `AGE`
# left on the search path by attach(); the columns were renamed to lower case
# above. Naming the argument keeps the "AGE" header in the printed table.
table(AGE = ceo$age)
## AGE
## 32 33 36 37 38 40 41 43 44 45 46 47 48 49 50 51 52 53 55 56 57 58 59 60 61 62
## 1 1 1 1 1 1 1 2 2 4 2 3 4 1 6 2 1 3 3 4 2 2 1 1 3 2
## 63 69 70 74
## 1 2 1 1
# Fill colour shared by the histograms.
chartcolor <- "blue"

# Salary histogram with 100-unit bins, plus title, caption and axis labels.
hist_salary <- ggplot(data = ceo, mapping = aes(x = salary)) +
  geom_histogram(fill = chartcolor, binwidth = 100) +
  labs(
    title = "Histogram of CEO salaries",
    caption = "Kristen Sosulski | Source: Statcrunch (2019)",
    x = "Salary in thousands of $USD",
    y = "Frequency"
  )
# Age histogram. Ages in this data run from 32 to 74 (see the table(AGE)
# output above), so the 100-wide bins copied from the salary chart would
# collapse everything into one or two bars; 5-year bins show the shape of the
# distribution.
hist_age <- ggplot(ceo, aes(age)) +
  geom_histogram(binwidth = 5, fill = chartcolor) +
  labs(title = "Histogram of CEO ages",
       caption = "Kristen Sosulski | Source: Statcrunch (2019)",
       x = "Age (in years)", y = "Frequency")
Introducing cowplot!
cowplot::plot_grid(hist_age, hist_salary, labels = "AUTO")
## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_bin()`).
Another example
library(car) # the car package; do not confuse it with the built-in `cars` dataset
## Loading required package: carData
scatterplot(x=cars$speed, y=cars$dist, xlab="Speed (mph)", ylab="Stopping distance(ft)", main="Cars: Speed and stopping distance", smooth=FALSE)
hist_salary <- hist_salary + scale_x_continuous(labels=scales::comma) #adds comma to thousands
cowplot::plot_grid(hist_age, hist_salary, labels = "AUTO")
## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_bin()`).
But there are some values still cut off. Solution: make the graph wider
# Widen the x axis up to 1500. The lower limit is given as NA so ggplot2
# derives it from the data. NOTE(review): the warnings above show salary holds
# a non-finite value, so min(ceo$salary) without na.rm presumably evaluated to
# NA already; this makes that behaviour explicit instead of accidental.
hist_salary <- hist_salary +
  scale_x_continuous(labels = scales::comma, limits = c(NA, 1500))
## Scale for x is already present.
## Adding another scale for x, which will replace the existing scale.
Preparing data:
undergrad <- readr::read_csv("undergrad.csv")
undergrad <- data.frame(undergrad)
#renaming columns in the undergrad data frame
names(undergrad) <- c("timestamp","excel","access", "statistics", "programming", "iscourse", "cscourse", "topics", "istopics", "onlinecourse", "concentration")
attach(undergrad)
hist(access)
## Error in hist.default(access): 'x' must be numeric
#reassign access as a factor variable
access <-as.factor(access)
# cast access as a numeric for plotting.
# Histogram of the integer codes behind the `access` factor; the returned
# "histogram" object is kept so its details can be printed.
# NOTE(review): hist() bins are right-closed with the lowest break included, so
# codes 1 and 2 share the first bar (breaks 1..6 give five bars for six
# levels). barplot(table(access)) would give one bar per level.
# Fixed the "reponse" typo in the x-axis label.
figure06 <- hist(
  as.numeric(access),
  main = "Responses to the level of importance of learning Microsoft Access",
  xlab = "Bins by response category",
  col = "#4cbea3",
  labels = TRUE,
  border = "#FFFFFF"
)
Printing figure06 outputs the details of the
graph
figure06
## $breaks
## [1] 1 2 3 4 5 6
##
## $counts
## [1] 10 9 10 2 8
##
## $density
## [1] 0.25641026 0.23076923 0.25641026 0.05128205 0.20512821
##
## $mids
## [1] 1.5 2.5 3.5 4.5 5.5
##
## $xname
## [1] "as.numeric(access)"
##
## $equidist
## [1] TRUE
##
## attr(,"class")
## [1] "histogram"
access_ordered <- ordered(x=access, levels=c("Strongly disagree","Disagree","Somewhat disagree","Neither agree or disagree","Somewhat agree","Agree","Strongly Agree"))
attributes(access_ordered)
## $levels
## [1] "Strongly disagree" "Disagree"
## [3] "Somewhat disagree" "Neither agree or disagree"
## [5] "Somewhat agree" "Agree"
## [7] "Strongly Agree"
##
## $class
## [1] "ordered" "factor"
table(access_ordered)
## access_ordered
## Strongly disagree Disagree Somewhat disagree
## 0 5 2
## Neither agree or disagree Somewhat agree Agree
## 9 10 5
## Strongly Agree
## 8
# Histogram of the integer codes of the ordered factor, so bars follow the
# Likert order rather than alphabetical order.
# NOTE(review): breaks = 7 is only a suggestion and bins are right-closed with
# the lowest break included, so codes 2 and 3 share the first bar (see $counts
# below: 7 = 5 + 2). barplot(table(access_ordered)) would give one bar per
# level.
# Fixed the "reponse" typo in the x-axis label.
figure07 <- hist(
  as.numeric(access_ordered),
  breaks = 7,
  main = "Responses to the level of importance of learning Microsoft Access",
  xlab = "Bins by response category",
  col = "#4cbea3",
  labels = TRUE,
  border = "#FFFFFF"
)
figure07
## $breaks
## [1] 2 3 4 5 6 7
##
## $counts
## [1] 7 9 10 5 8
##
## $density
## [1] 0.1794872 0.2307692 0.2564103 0.1282051 0.2051282
##
## $mids
## [1] 2.5 3.5 4.5 5.5 6.5
##
## $xname
## [1] "as.numeric(access_ordered)"
##
## $equidist
## [1] TRUE
##
## attr(,"class")
## [1] "histogram"
# Draw Figure 6 and Figure 7 side by side. par() returns the previous
# settings, which are restored afterwards so later base plots are not left in
# the two-panel layout.
old_par <- par(mfrow = c(1, 2))
hist(as.numeric(access), main = "Figure 6",
     xlab = "Bins by response category", col = "#4cbea3", labels = TRUE,
     border = "#FFFFFF")
hist(as.numeric(access_ordered), breaks = 7, main = "Figure 7",
     xlab = "Bins by response category", col = "#4cbea3", labels = TRUE,
     border = "#FFFFFF")
par(old_par)
detach(undergrad)
Data visualization ggplot2 cheat sheet
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ lubridate 1.9.3 ✔ tibble 3.2.1
## ✔ purrr 1.0.2 ✔ tidyr 1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ✖ dplyr::recode() masks car::recode()
## ✖ purrr::some() masks car::some()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)
penguins <- palmerpenguins::penguins
penguins
## # A tibble: 344 × 8
## species island bill_length_mm bill_depth_mm flipper_length_mm body_mass_g
## <fct> <fct> <dbl> <dbl> <int> <int>
## 1 Adelie Torgersen 39.1 18.7 181 3750
## 2 Adelie Torgersen 39.5 17.4 186 3800
## 3 Adelie Torgersen 40.3 18 195 3250
## 4 Adelie Torgersen NA NA NA NA
## 5 Adelie Torgersen 36.7 19.3 193 3450
## 6 Adelie Torgersen 39.3 20.6 190 3650
## 7 Adelie Torgersen 38.9 17.8 181 3625
## 8 Adelie Torgersen 39.2 19.6 195 4675
## 9 Adelie Torgersen 34.1 18.1 193 3475
## 10 Adelie Torgersen 42 20.2 190 4250
## # ℹ 334 more rows
## # ℹ 2 more variables: sex <fct>, year <int>
#View(penguins)
dplyr::glimpse(penguins)
## Rows: 344
## Columns: 8
## $ species <fct> Adelie, Adelie, Adelie, Adelie, Adelie, Adelie, Adel…
## $ island <fct> Torgersen, Torgersen, Torgersen, Torgersen, Torgerse…
## $ bill_length_mm <dbl> 39.1, 39.5, 40.3, NA, 36.7, 39.3, 38.9, 39.2, 34.1, …
## $ bill_depth_mm <dbl> 18.7, 17.4, 18.0, NA, 19.3, 20.6, 17.8, 19.6, 18.1, …
## $ flipper_length_mm <int> 181, 186, 195, NA, 193, 190, 181, 195, 193, 190, 186…
## $ body_mass_g <int> 3750, 3800, 3250, NA, 3450, 3650, 3625, 4675, 3475, …
## $ sex <fct> male, female, female, NA, female, male, female, male…
## $ year <int> 2007, 2007, 2007, 2007, 2007, 2007, 2007, 2007, 2007…
library(ggplot2)
ggplot(data = penguins)
ggplot(
data = penguins,
mapping = aes(x = flipper_length_mm, y = body_mass_g)
)
geom_bar() bar charts
geom_line() line chart
geom_boxplot() boxplot
geom_point() scatterplot
ggplot(
data = penguins,
mapping = aes(x = flipper_length_mm, y = body_mass_g)
) +
geom_point() # the graph
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_point()`).
Notice the warning. “there are two penguins in our dataset with missing body mass and/or flipper length values and ggplot2 has no way of representing them on the plot without both of these values” but R calls it out so that we know that there were some taken out
aes()
ggplot(
data = penguins,
mapping = aes(x = flipper_length_mm, y = body_mass_g, color = species)
) +
geom_point() # the graph
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_point()`).
When a categorical variable is mapped to an aesthetic, ggplot2 will
automatically assign a unique value of the aesthetic (here a unique
color) to each unique level of the variable (each of the three
species)
Translated: Color coding happens on its own!
geom_smooth with method="lm"
ggplot(
data = penguins,
mapping = aes(x = flipper_length_mm, y = body_mass_g, color = species)
) +
geom_point() + # the graph
geom_smooth(method = "lm") # the other layer
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 2 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_point()`).
ggplot( # global level
data = penguins,
mapping = aes(x = flipper_length_mm, y = body_mass_g)
) + # everything after this is local
geom_point(mapping = aes(color = species)) +
geom_smooth(method = "lm")
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 2 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_point()`).
Take into consideration color blindness, so also use shapes
ggplot(
data = penguins,
mapping = aes(x = flipper_length_mm, y = body_mass_g)
) +
geom_point(mapping = aes(color = species, shape = species)) + # here
geom_smooth(method = "lm")
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 2 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_point()`).
library(ggthemes)
# Flipper length vs. body mass: points coloured and shaped by species, a
# single linear fit over all points, descriptive labels, and a
# colourblind-safe palette.
ggplot(penguins, aes(x = flipper_length_mm, y = body_mass_g)) +
  geom_point(mapping = aes(color = species, shape = species)) +
  geom_smooth(method = "lm") +
  ggthemes::scale_color_colorblind() +
  labs(
    title = "Body mass and flipper length",
    subtitle = "Dimensions for Adelie, Chinstrap, and Gentoo Penguins",
    x = "Flipper length (mm)",
    y = "Body mass (g)",
    color = "Species",
    shape = "Species"
  )
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 2 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_point()`).
ggplot(penguins, aes(x = flipper_length_mm, y = body_mass_g)) +
geom_point()
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_point()`).
and
penguins |>
ggplot(aes(x = flipper_length_mm, y = body_mass_g)) +
geom_point()
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_point()`).
A categorical variable can only take one of a small set of values
Example: bar chart
ggplot(penguins, aes(x = species)) +
geom_bar()
In bar plots of categorical variables with non-ordered levels, like the penguin species above, it’s often preferable to reorder the bars based on their frequencies. Doing so requires transforming the variable to a factor (how R handles categorical data) and then reordering the levels of that factor.
ggplot(penguins, aes(x = fct_infreq(species))) +
geom_bar()
A numerical variable can take on a wide range of numerical values, and it is sensible to
add, subtract, or take averages with those values. Numerical variables
can be continuous or discrete
Example: histogram
ggplot(penguins, aes(x = body_mass_g)) +
geom_histogram(binwidth = 200)
## Warning: Removed 2 rows containing non-finite outside the scale range
## (`stat_bin()`).
Additional bins
ggplot(penguins, aes(x = body_mass_g)) +
geom_histogram(binwidth = 20)
## Warning: Removed 2 rows containing non-finite outside the scale range
## (`stat_bin()`).
ggplot(penguins, aes(x = body_mass_g)) +
geom_histogram(binwidth = 2000)
## Warning: Removed 2 rows containing non-finite outside the scale range
## (`stat_bin()`).
Density plots
ggplot(penguins, aes(x = body_mass_g)) +
geom_density()
## Warning: Removed 2 rows containing non-finite outside the scale range
## (`stat_density()`).
Box plots
Example: distribution of body mass
ggplot(penguins, aes(x = species, y = body_mass_g)) +
geom_boxplot()
## Warning: Removed 2 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
Example: same but with density plot
ggplot(penguins, aes(x = body_mass_g, color = species)) +
geom_density(linewidth = 0.75)
## Warning: Removed 2 rows containing non-finite outside the scale range
## (`stat_density()`).
Prettified
ggplot(penguins, aes(x = body_mass_g, color = species, fill = species)) +
geom_density(alpha = 0.5)
## Warning: Removed 2 rows containing non-finite outside the scale range
## (`stat_density()`).
ggplot(penguins, aes(x = island, fill = species)) +
geom_bar()
The same data shown in a different way, with position="fill":
this is more useful for comparing species distributions across islands since
it’s not affected by the unequal numbers of penguins across the
islands
ggplot(penguins, aes(x = island, fill = species)) +
geom_bar(position = "fill")
A scatterplot is probably the most commonly used plot for visualizing the relationship between two numerical variables
ggplot(penguins, aes(x = flipper_length_mm, y = body_mass_g)) +
geom_point()
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_point()`).
Example: the colors of points represent species and the shapes of points represent islands
ggplot(penguins, aes(x = flipper_length_mm, y = body_mass_g)) +
geom_point(aes(color = species, shape = island))
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_point()`).
Facets: subplots that each display one subset of the data.
ggplot(penguins, aes(x = flipper_length_mm, y = body_mass_g)) +
geom_point(aes(color = species, shape = species)) +
facet_wrap(~island)
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_point()`).
Example: ggsave(filename = "penguin-plot.png")