–metadata title=“R for Data Science”
set up
download the package and data
# Install tidyverse only when it is not already available.
# requireNamespace() checks for the package without attaching it, so this
# step is purely an "is it installed?" test; the library(tidyverse) call
# further down attaches it and fails loudly if the install did not succeed.
if (!requireNamespace("tidyverse", quietly = TRUE)) {
  install.packages("tidyverse")
}
## Loading required package: tidyverse
## Warning: package 'tidyverse' was built under R version 4.5.2
## Warning: package 'forcats' was built under R version 4.5.2
## Warning: package 'lubridate' was built under R version 4.5.2
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.1 ✔ stringr 1.5.2
## ✔ ggplot2 4.0.0 ✔ tibble 3.3.0
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.1.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(tidyverse)
# Vehicle class mapped to point colour
ggplot(data = mpg, mapping = aes(x = displ, y = hwy, color = class)) +
  geom_point()
# Vehicle class mapped to point shape
ggplot(data = mpg, mapping = aes(x = displ, y = hwy, shape = class)) +
  geom_point()
## Warning: The shape palette can deal with a maximum of 6 discrete values because more
## than 6 becomes difficult to discriminate
## ℹ you have requested 7 values. Consider specifying shapes manually if you need
## that many of them.
## Warning: Removed 62 rows containing missing values or values outside the scale range
## (`geom_point()`).
# Vehicle class mapped to point size
ggplot(data = mpg, mapping = aes(x = displ, y = hwy, size = class)) +
  geom_point()
## Warning: Using size for a discrete variable is not advised.
#> Warning: Using size for a discrete variable is not advised.
# Vehicle class mapped to point transparency
ggplot(data = mpg, mapping = aes(x = displ, y = hwy, alpha = class)) +
  geom_point()
## Warning: Using alpha for a discrete variable is not advised.
#> Warning: Using alpha for a discrete variable is not advised.
# Aesthetics given outside aes() are fixed values applied to every point
ggplot(data = mpg, mapping = aes(x = displ, y = hwy)) +
  geom_point(
    shape = 21,
    size = 2,
    colour = "yellow",
    fill = "red"
  )
Create a scatterplot of hwy vs. displ where the points are pink filled in triangles.
Why did the following code not result in a plot with blue points?
ggplot(mpg) + geom_point(aes(x = displ, y = hwy, color = "blue"))
In ggplot2, the stroke aesthetic controls the width (thickness) of a point's border, measured in mm. It functions similarly to the linewidth aesthetic used for lines and polygons, allowing you to set the thickness of the outline separately from the overall size of the shape.
# Exercise 1: pink filled triangles
ggplot(data = mpg, mapping = aes(x = displ, y = hwy)) +
  geom_point(colour = "pink", shape = 17)
# Exercise 2: "blue" inside aes() is mapped like a data value;
# set outside aes() it is used as the actual point colour
ggplot(data = mpg) +
  geom_point(mapping = aes(x = displ, y = hwy, color = "blue"))
ggplot(data = mpg) +
  geom_point(mapping = aes(x = displ, y = hwy), color = "blue")
# Exercise 3: stroke sets the border width of a fillable shape
ggplot(data = mpg, mapping = aes(x = displ, y = hwy)) +
  geom_point(
    shape = 21,       # circle with a separate fill
    colour = "black", # border colour
    fill = "red",     # interior colour
    size = 5,         # point size
    stroke = 2,       # border thickness
    alpha = 0.5
  )
# Exercise 4: mapping an expression colours points by TRUE/FALSE
ggplot(data = mpg, mapping = aes(x = displ, y = hwy, color = displ < 5)) +
  geom_point(shape = 17)
# Points and smoothers together, both distinguished by drive train
ggplot(data = mpg, mapping = aes(x = displ, y = hwy, color = drv)) +
  geom_point(mapping = aes(shape = drv)) +
  geom_smooth(mapping = aes(linetype = drv))
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
# One smoother for the whole dataset
ggplot(data = mpg, mapping = aes(x = displ, y = hwy)) +
  geom_smooth()
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
# group alone splits the smoother per drv but adds no legend or
# distinguishing visual feature
ggplot(data = mpg, mapping = aes(x = displ, y = hwy)) +
  geom_smooth(mapping = aes(group = drv))
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
# Colour and linetype both split the smoother and produce a legend
ggplot(data = mpg, mapping = aes(x = displ, y = hwy)) +
  geom_smooth(mapping = aes(color = drv, linetype = drv))
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
# show.legend = FALSE drops the legend; se = FALSE hides the grey ribbon
# so that only the smoothed lines are drawn
ggplot(data = mpg, mapping = aes(x = displ, y = hwy)) +
  geom_smooth(
    mapping = aes(color = drv, linetype = drv),
    se = FALSE,
    show.legend = FALSE
  )
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
# A mapping declared inside one layer applies to that layer only
ggplot(data = mpg, mapping = aes(x = displ, y = hwy)) +
  geom_point(mapping = aes(color = class)) +
  geom_smooth()
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
# Highlighting a subset of the data with explicitly chosen colours:
# all cars in blue, then the 2seaters redrawn in red with an open circle
ggplot(data = mpg, mapping = aes(x = displ, y = hwy)) +
  geom_point(color = "blue") +
  geom_point(
    color = "red",
    data = mpg |> filter(class == "2seater")
  ) +
  geom_point(
    color = "red",
    shape = "circle open",
    size = 3,
    data = mpg |> filter(class == "2seater")
  )
# Alternative: map the condition itself to colour; the colours are then
# chosen by ggplot2 rather than specified here
ggplot(data = mpg, mapping = aes(x = displ, y = hwy)) +
  geom_point(mapping = aes(color = class == "2seater"))
# Distribution of hwy as a histogram (bins 2 units wide)
ggplot(data = mpg, mapping = aes(x = hwy)) +
  geom_histogram(binwidth = 2)
# Distribution of hwy as a density curve
ggplot(data = mpg, mapping = aes(x = hwy)) +
  geom_density()
# Distribution of hwy as a boxplot
ggplot(data = mpg, mapping = aes(x = hwy)) +
  geom_boxplot()
### 9.3.1
Exercises
In ggplot2, geometric objects (geoms) determine the visual representation of your data. To draw the specified charts, you would use the following functions:
- Line Chart: geom_line(). It connects data points from left to right to display trends, often over time.
- Boxplot: geom_boxplot(). It visualizes the distribution of data through a five-number summary (minimum, first quartile, median, third quartile, and maximum).
- Histogram: geom_histogram(). It displays the distribution of a continuous variable by "binning" values into ranges and counting the number of observations in each.
- Area Chart: geom_area(). It is similar to a line chart but fills the area between the line and the x-axis with color or shading.
# Line chart: mean displacement per model year
mpg |>
  group_by(year) |>
  summarise(mean = mean(displ)) |>
  ggplot(mapping = aes(x = year, y = mean)) +
  geom_line() +
  geom_point()
# Boxplot: displacement, one box per model year
ggplot(
  data = mpg,
  mapping = aes(x = displ, colour = as.factor(year))
) +
  geom_boxplot() +
  labs(colour = "Year")
# Histogram: highway mileage, filled by whether cty exceeds 15
ggplot(
  data = mpg,
  mapping = aes(x = hwy, fill = cty > 15)
) +
  geom_histogram(bins = 20)
# Area chart: mean displacement per model year
mpg |>
  group_by(year) |>
  summarise(mean = mean(displ)) |>
  ggplot(mapping = aes(x = year, y = mean)) +
  geom_area(alpha = 0.5, fill = "green")
ggplot(mpg, aes(x = displ, y = hwy)) + geom_smooth(aes(color = drv), show.legend = FALSE)
What does show.legend = FALSE do here? What happens if you remove it? Why do you think we used it earlier?
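It suppresses the legend for that layer. If you remove it, ggplot2 draws a legend for the color mapping to drv. It was used earlier so that the plots being compared side by side kept the same plotting area, instead of one of them being shrunk to make room for a legend.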
What does the se argument to geom_smooth() do?
In ggplot2, the se argument controls whether geom_smooth() draws the grey
ribbon around the smoothed line; setting se = FALSE hides it. The ribbon
represents the confidence interval around the smoothed line. Statistical
meaning: by default, it displays a 95% confidence interval for the smoothed
mean. This means that if you were to repeat the sampling process many times,
about 95% of the intervals constructed this way would contain the true
underlying trend.
Recreate the R code necessary to generate the following graphs. Note that wherever a categorical variable is used in the plot, it’s drv.
# Plot 1: points with a single smoother, no ribbon
ggplot(data = mpg, mapping = aes(x = displ, y = hwy)) +
  geom_point(size = 2) +
  geom_smooth(se = FALSE)
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
# Plot 2: one smoother per drv, not distinguished visually
ggplot(data = mpg, mapping = aes(x = displ, y = hwy)) +
  geom_point(size = 2) +
  geom_smooth(mapping = aes(group = drv), se = FALSE)
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
# Plot 3: colour mapped globally, so points and smoothers are both coloured
ggplot(data = mpg, mapping = aes(x = displ, y = hwy, colour = drv)) +
  geom_point(size = 2) +
  geom_smooth(se = FALSE)
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
# Plot 4: colour mapped on the points only, single smoother
ggplot(data = mpg, mapping = aes(x = displ, y = hwy)) +
  geom_point(mapping = aes(colour = drv), size = 2) +
  geom_smooth(se = FALSE)
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
# Plot 5: coloured points, smoothers distinguished by linetype
ggplot(data = mpg, mapping = aes(x = displ, y = hwy)) +
  geom_point(mapping = aes(colour = drv), size = 2) +
  geom_smooth(mapping = aes(linetype = drv), se = FALSE)
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
# Plot 6: white ring drawn first, coloured points drawn on top of it
ggplot(data = mpg, mapping = aes(x = displ, y = hwy)) +
  geom_point(colour = "white", shape = 1, size = 2, stroke = 2) +
  geom_point(mapping = aes(colour = drv), size = 2)
# One panel per number of cylinders
ggplot(data = mpg, mapping = aes(x = displ, y = hwy)) +
  geom_point() +
  facet_wrap(~cyl)
# Grid of panels; the formula reads rows ~ cols
ggplot(data = mpg, mapping = aes(x = displ, y = hwy)) +
  geom_point() +
  facet_grid(drv ~ cyl)
# scales = "free" lets both the x and the y scales vary between panels
ggplot(mpg, aes(x = displ, y = hwy)) +
  geom_point() +
  facet_grid(drv ~ cyl, scales = "free")
# Free scales:
# "free_x" allows different x-axis scales across columns
# "free_y" allows different y-axis scales across rows
ggplot(mpg, aes(x = displ, y = hwy)) +
  geom_point() +
  facet_grid(drv ~ cyl, scales = "free_y")
# facet_wrap() draws one panel for every unique value of the variable
ggplot(data = mpg, mapping = aes(x = displ, y = hwy)) +
  geom_point() +
  facet_wrap(~cyl)
ggplot(data = mpg, mapping = aes(x = displ, y = hwy)) +
  geom_point() +
  facet_wrap(~cty)
mpg$cty |> unique() |> length()
## [1] 21
ggplot(data = mpg, mapping = aes(x = cty, y = hwy)) +
  geom_point() +
  facet_wrap(~displ)
mpg$displ |> unique() |> length()
## [1] 35
# Bin the continuous variable into categories before faceting
ggplot(data = mpg, mapping = aes(x = cty, y = hwy)) +
  geom_point() +
  facet_wrap(~ cut_width(displ, 2))
Faceting on a continuous variable draws a graph for each value of that
variable: a separate facet is created for every unique value in the dataset.
To avoid this, bin the variable first:
- cut_interval(): divides data into a fixed number of bins (e.g., exactly 5 bins).
- cut_width(): divides data by a fixed width (e.g., a bin for every 10 units).
- cut_number(): divides data so each facet has roughly the same number of data points.
- cut(): standard R function for custom, manual break points.
ggplot(mpg) + geom_point(aes(x = drv, y = cyl))
# Faceted grid of drv by cyl
ggplot(data = mpg, mapping = aes(x = displ, y = hwy)) +
  geom_point() +
  facet_grid(drv ~ cyl)
# The same two variables plotted directly against each other
ggplot(data = mpg) +
  geom_point(mapping = aes(x = drv, y = cyl))
3. What plots does the following code make? What does . do?
# drv ~ . : facet by drv on the rows only
ggplot(data = mpg) +
  geom_point(mapping = aes(x = displ, y = hwy)) +
  facet_grid(drv ~ .)
# . ~ cyl : facet by cyl on the columns only
ggplot(data = mpg) +
  geom_point(mapping = aes(x = displ, y = hwy)) +
  facet_grid(. ~ cyl)
# facet_wrap() lays the drv panels out as columns
ggplot(data = mpg) +
  geom_point(mapping = aes(x = displ, y = hwy)) +
  facet_wrap(~drv)
4. Take the first faceted plot in this section:
ggplot(mpg) + geom_point(aes(x = displ, y = hwy)) + facet_wrap(~ cyl, nrow = 2)
# Faceting by cyl, panels arranged in two rows
ggplot(data = mpg) +
  geom_point(mapping = aes(x = displ, y = hwy)) +
  facet_wrap(~ cyl, nrow = 2)
# Same information with cyl as a numeric colour mapping
ggplot(data = mpg) +
  geom_point(mapping = aes(x = displ, y = hwy, colour = cyl))
# ... and with cyl converted to a factor
ggplot(data = mpg) +
  geom_point(mapping = aes(x = displ, y = hwy, colour = as.factor(cyl)))
What are the advantages to using faceting instead of the color
aesthetic? What are the disadvantages? How might the balance change if
you had a larger dataset?
# Base scatterplot reused by the faceting examples below
p <- ggplot(mpg, aes(displ, hwy)) + geom_point()
# Use vars() to supply faceting variables:
p + facet_wrap(vars(class))
# Control the number of rows and columns with nrow and ncol
p + facet_wrap(vars(class), nrow = 4)
p + facet_wrap(vars(class), ncol = 4)
# You can facet by multiple variables
ggplot(mpg, aes(displ, hwy)) +
  geom_point() +
  facet_wrap(vars(cyl, drv))
# Use the `labeller` option to control how labels are printed:
ggplot(mpg, aes(displ, hwy)) +
  geom_point() +
  facet_wrap(vars(cyl, drv), labeller = "label_both")
# Use of facet_grid (the stray empty argument after the formula is removed;
# it only left `cols` at its default and was a typo)
p + facet_grid(cyl ~ drv, labeller = "label_both") # row: cyl, column: drv
p + facet_grid(drv ~ cyl, labeller = "label_both") # row: drv, column: cyl
# Histograms of displ stacked in rows, one per drv
ggplot(data = mpg, mapping = aes(x = displ)) +
  geom_histogram() +
  facet_grid(drv ~ .)
## `stat_bin()` using `bins = 30`. Pick better value `binwidth`.
# Histograms of displ side by side in columns, one per drv
ggplot(data = mpg, mapping = aes(x = displ)) +
  geom_histogram() +
  facet_grid(. ~ drv)
## `stat_bin()` using `bins = 30`. Pick better value `binwidth`.
7. Recreate the following plot using facet_wrap() instead of
facet_grid(). How do the positions of the facet labels change?
ggplot(mpg) + geom_point(aes(x = displ, y = hwy)) + facet_grid(drv ~ .)
# Original: facet_grid() with drv on the rows
ggplot(data = mpg) +
  geom_point(mapping = aes(x = displ, y = hwy)) +
  facet_grid(drv ~ .)
# facet_wrap() with its default layout
ggplot(data = mpg) +
  geom_point(mapping = aes(x = displ, y = hwy)) +
  facet_wrap(~drv)
# facet_wrap() forced into a single column of panels
ggplot(data = mpg) +
  geom_point(mapping = aes(x = displ, y = hwy)) +
  facet_wrap(~drv, ncol = 1)
What is the default geom associated with stat_summary()? How could you rewrite the previous plot to use that geom function instead of the stat function?
What does geom_col() do? How is it different from geom_bar()?
# geom_bar(): bar height is the count of rows for each x value
g <- ggplot(data = mpg, mapping = aes(x = class))
g + geom_bar()
# geom_col(): both x and y are taken from variables in the data
ggplot(data = mpg, mapping = aes(x = class, y = hwy)) +
  geom_col(mapping = aes(fill = drv))
# Proportions instead of counts, mapped globally ...
ggplot(
  data = mpg,
  mapping = aes(x = class, y = after_stat(prop), group = 1)
) +
  geom_bar()
# ... or with the same mapping supplied locally to the layer
g + geom_bar(mapping = aes(y = after_stat(prop), group = 1))
Most geoms and stats come in pairs that are almost always used in concert. Make a list of all the pairs. What do they have in common? (Hint: Read through the documentation.)

| Visual Goal | Geom Function | Stat Function |
|---|---|---|
| Bar Chart (counting) | geom_bar() | stat_count() |
| Histogram | geom_histogram() | stat_bin() |
| Frequency Polygon | geom_freqpoly() | stat_bin() |
| Boxplot | geom_boxplot() | stat_boxplot() |
| Density Plot | geom_density() | stat_density() |
| Smooth Trend Line | geom_smooth() | stat_smooth() |
| Violin Plot | geom_violin() | stat_ydensity() |
| 2D Heatmap (Squares) | geom_bin2d() | stat_bin2d() |
| 2D Heatmap (Hexagons) | geom_hex() | stat_binhex() |
| Quantile Regression | geom_quantile() | stat_quantile() |
| Contours | geom_contour() | stat_contour() |
| Point Counts | geom_count() | stat_sum() |
| QQ Plot | geom_qq() | stat_qq() |
What variables does stat_smooth() compute? What arguments control its behavior?
In our proportion bar chart, we needed to set group = 1. Why? In other words, what is the problem with these two graphs?
ggplot(diamonds, aes(x = cut, y = after_stat(prop))) + geom_bar()
ggplot(diamonds, aes(x = cut, fill = color, y = after_stat(prop))) + geom_bar()
# Baseline: geom_bar() plots after_stat(count) by default
ggplot(data = diamonds, mapping = aes(x = cut)) +
  geom_bar()
# The two plots from the question, without a group aesthetic
ggplot(data = diamonds, mapping = aes(x = cut, y = after_stat(prop))) +
  geom_bar()
ggplot(
  data = diamonds,
  mapping = aes(x = cut, fill = color, y = after_stat(prop))
) +
  geom_bar()
# group = 1: the denominator is the whole dataset
ggplot(
  data = diamonds,
  mapping = aes(x = cut, y = after_stat(prop), group = 1)
) +
  geom_bar()
# group = 1 together with fill: same denominator, but the fill is lost
ggplot(
  data = diamonds,
  mapping = aes(x = cut, fill = color, y = after_stat(prop), group = 1)
) +
  geom_bar()
## Warning: The following aesthetics were dropped during statistical transformation: fill.
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
## the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
## variable into a factor?
# group = color: the denominator is the count within each color
ggplot(
  data = diamonds,
  mapping = aes(x = cut, fill = color, y = after_stat(prop), group = color)
) +
  geom_bar()
# count / sum(count): proportion of each cut/color combination relative to
# the entire dataset, not just within its own color
ggplot(
  data = diamonds,
  mapping = aes(x = cut, fill = color, y = after_stat(count/sum(count)))
) +
  geom_bar()
# color affects the bar outline, fill affects the bar interior
ggplot(data = mpg, mapping = aes(x = drv, color = drv)) +
  geom_bar()
ggplot(data = mpg, mapping = aes(x = drv, fill = drv)) +
  geom_bar()
# Mapping fill to a different variable stacks the bars;
# geom_bar() uses position = "stack" unless told otherwise
ggplot(data = mpg, mapping = aes(x = drv, fill = class)) +
  geom_bar()
# position = "identity" places each object exactly where it falls in the
# context of the graph. For bars this is rarely useful because they overlap.
# To see the overlap, make the bars slightly transparent with a small alpha ...
ggplot(data = mpg, mapping = aes(x = drv, fill = class)) +
  geom_bar(position = "identity", alpha = 1/5)
# ... or completely transparent with fill = NA.
ggplot(data = mpg, mapping = aes(x = drv, color = class)) +
  geom_bar(position = "identity", fill = NA)
# position = "fill" stacks as well, but makes every stack the same height,
# which makes proportions easier to compare across groups.
ggplot(data = mpg, mapping = aes(x = drv, fill = class)) +
  geom_bar(position = "fill")
# position = "dodge" puts overlapping objects directly beside one another,
# which makes individual values easier to compare.
ggplot(data = mpg, mapping = aes(x = drv, fill = class)) +
  geom_bar(position = "dodge")
# position = "jitter" adds a small amount of random noise to each point.
# The points spread out because no two of them are likely to receive the
# same amount of noise.
ggplot(data = mpg, mapping = aes(x = displ, y = hwy)) +
  geom_point(position = "jitter")
# Four ways of drawing the same cty vs hwy scatterplot
q <- ggplot(data = mpg, mapping = aes(x = cty, y = hwy))
q + geom_point()
q + geom_point(alpha = 0.3)
q + geom_jitter()
q + geom_count()
# Default position versus an explicit position = "identity"
ggplot(data = mpg, mapping = aes(x = displ, y = hwy)) +
  geom_point()
ggplot(data = mpg, mapping = aes(x = displ, y = hwy)) +
  geom_point(position = "identity")
# The two plots above show no difference
p <- ggplot(data = mpg, mapping = aes(x = cyl, y = hwy))
p + geom_point()
p + geom_jitter()
# Jitter combined with an aesthetic mapping
p + geom_jitter(mapping = aes(colour = class))
# Smaller width/height keeps the categories visually separate
p + geom_jitter()
p + geom_jitter(width = 0.15)
p + geom_jitter(height = 0.15)
p + geom_jitter(width = 0.15, height = 0.15)
p + geom_jitter(width = 0.15, height = 0.15, alpha = 0.3)
# Larger width/height smooths the discreteness away completely
ggplot(data = mpg, mapping = aes(x = cty, y = hwy)) +
  geom_jitter()
ggplot(data = mpg, mapping = aes(x = cty, y = hwy)) +
  geom_jitter(width = 0.75, height = 0.75)
4. Compare and contrast geom_jitter() with geom_count().
position = "dodge2"
# Boxplots of hwy per class under different position adjustments
p <- ggplot(data = mpg, mapping = aes(x = class, y = hwy))
p + geom_boxplot()
p + geom_boxplot(position = "dodge")
p + geom_boxplot(position = "identity")
# With colour mapped to drv and no position given
ggplot(mpg, aes(x = class, y = hwy, color = drv)) +
  geom_boxplot() +
  labs(
    title = "Default 'dodge2' Position Adjustment in geom_boxplot()",
    x = "Vehicle Class",
    y = "Highway MPG",
    color = "Drive Type"
  )
# Same plot with position = "identity"
ggplot(mpg, aes(x = class, y = hwy, color = drv)) +
  geom_boxplot(position = "identity") +
  labs(
    title = " 'identity' Position Adjustment in geom_boxplot()",
    x = "Vehicle Class",
    y = "Highway MPG",
    color = "Drive Type"
  )
ggplot(data = ) +