library(tidyverse)
library(highcharter)
library(leaflet)
library(sf)
# Load the example data sets used throughout these notes.
debt <- read_csv("household_debt.csv") # for a quant data example with percentages
crimes <- read_csv("hateCrimes2010.csv") # for a quantitative and somewhat categorical data example
squirinfo <- read_csv("centralparksquirrelcensus.csv") # for a categorical example that requires a ton of cleaning, and has GIS features
squirhectare <- read_csv("centralpartsquirrel_hectare.csv") # NOTE(review): "centralpart" may be a typo for "centralpark" - confirm against the file name on disk
r notes
to add:
☙ pivot_long and pivot_wide
☙ theme() customization
☙ substr()
☙ mutate with case_when (more examples of how the format works)
☙ absolute value
☙ time/date data, lubridate
☙ stacking plots (just slightly more)
☙ quarto/markdown customization, adjusting the html and css, how to make a dashboard, shiny: maybe just make a whole new document for this
PIPING: done with |>
or %>%
+
is only used in ggplot to connect parts of your code together
$
refers to column
!
means NOT (!=
) NOT EQUAL
=
is the same as <-
==
equal to this value
>=
greater than or equal to
NaN
not a number
NA
not available (a missing value)
Add message = FALSE
or results = FALSE
right after {r
to make it just show the code and not the output in your rendered document. can use this to hide the messages but only show your graph.
type echo = FALSE
to only print the output and hide the code. can use this to hide your code and just show your plot, but it might also include additional messages (not clear)
type include = FALSE
to totally hide everything
sometimes you still see crap under your chunk after all of this, in which case add warning = FALSE
⭑ for extra insurance you can add this to your yaml:
execute:
message: false
Knitr/render/rpubs:
If you experience errors when knitting/rendering:
1. Clean global environment
2. Run each chunk from the beginning one by one
3. Make sure there’s a space after any heading marker (the ##)
4. Make sure the yaml is normal
5. Make sure you actually loaded all the libraries in the correct order! (it’s this more than you think)
Always use read_csv (from tidyverse), and never read.csv
# adjust column names/titles: lower-case them and strip characters that are
# awkward to type in a column name
names(crimes) <- tolower(names(crimes))
names(crimes) <- gsub(" ", "", names(crimes))
names(crimes) <- gsub("-", "", names(crimes)) # remove troublesome symbols

names(squirinfo) <- tolower(names(squirinfo))
names(squirinfo) <- gsub(" ", "", names(squirinfo))

names(squirhectare) <- tolower(names(squirhectare))
names(squirhectare) <- gsub(" ", "", names(squirhectare))

# no `by` given, so left_join() matches on the columns the two tables share
squircensus <- left_join(squirhectare, squirinfo) # it joined by hectare

invisible(head(crimes)) # use invisible() before a command in QMD or RMD to make it not print into the output
Using base R, tidyverse
Remove rows/columns
- rows
# a negative index drops the listed row numbers; the comma then nothing means "all columns"
brokensquirrel <- squircensus[-c(3:66, 68:87, 90:757, 760:1000, 1001:2536, 2540:3102), ]
- columns (
select
)
# remove columns: a minus sign in select() drops that column
brokensquirrel <- brokensquirrel |>
  select(-sighterobservedweatherdata, -litternotes, -otheranimalsightings, -hectareconditions)

# select ONLY columns: list just the ones to keep
brokensquirrel <- brokensquirrel |>
  select(litter, totaltimeofsighting, otheractivities, otherinteractions)

brokensquirrel
# A tibble: 10 × 4
litter totaltimeofsighting otheractivities otherinteractions
<chr> <dbl> <chr> <chr>
1 Some 22 <NA> <NA>
2 Some 22 <NA> <NA>
3 None 20 <NA> <NA>
4 Abundant 25 <NA> couldn't get near
5 Abundant 25 <NA> <NA>
6 None 25 <NA> <NA>
7 Some 30 <NA> <NA>
8 Some 25 <NA> <NA>
9 Some 25 jumping stared at me
10 Some 25 laid down on the branch <NA>
# only 4 columns - and only 10 rows, because of previous chunk
supposedly can also do this: table(dat$Marital_status, dat$approval_status)
(source https://www.pluralsight.com/resources/blog/guides/testing-for-relationships-between-categorical-variables-using-the-chi-square-test)
Filter pieces of text - using grep
ex: select only squirrels with “mushroom” in their other notes
# keep only the rows whose otheractivities text contains "mushroom";
# grepl() returns TRUE/FALSE per row, and subset() keeps the TRUE rows
fungallyinclinedsquirrels <- subset(squircensus, grepl("mushroom", squircensus$otheractivities))
fungallyinclinedsquirrels
grepl
:
Strip down to just numbers/similar
ex: transform messy weather data into just temperature
# turning the "sighterobservedweatherdata" column into just "temperature"
squircensus <- squircensus |>
  mutate(temperature = gsub("\\D", "", sighterobservedweatherdata)) # "\\D" matches every non-digit character; replacing it with "" leaves only the digits
# gsub() returns text, so convert it to a number
squircensus$temperature <- as.numeric(squircensus$temperature)
# some were input "~72-73", which leaves the digits run together, so:
squircensus <- squircensus |> filter(temperature < 99) # this removes those rows (and NA rows) rather than fixing them
Filter for repetition - using n()
Select only rows with duplicates (repeats in the column specified in group_by
)
# keep only the rows whose temperature value occurs more than once
reptemps <- squircensus |>
  group_by(temperature) |> # group_by then pipe means search just this column
  filter(n() > 1) |> # n() is the number of rows in the current group
  ungroup() # drop the grouping so later steps work on the whole table again
group_by
, summarise
, n()
Use group_by()
on its own or pair with summarise()
# something
Make new column with mutate
# mutate() adds one new column per name = expression pair
crimes <- crimes |>
  mutate(
    thecrime = (antiwhite * anticatholic) * 1000000000000,
    # `&` turns each count into TRUE/FALSE (non-zero = TRUE), so this is the
    # share of rows where both columns are non-zero, repeated on every row
    meanreligion = mean((antijewish & anticatholic))
  ) # I don't know why you would do this but you can, you can do pretty much anything
Combine columns with mutate
, case_when
, or if_else
# case_when(): paste the two columns together, but return NA in the new column
# when either of the called-upon columns is NA
squirinfo <- squirinfo |>
  mutate(squirrel = case_when(
    !is.na(hectaresquirrelnumber) & !is.na(age) ~
      paste(hectaresquirrelnumber, age),
    TRUE ~ NA_character_ # fallback for every row not matched above
  ))

# if_else(): does the same thing in this instance. case_when() can handle more
# than two branches; with if_else() the fallback is simply the third argument.
squirinfo <- squirinfo |>
  mutate(squirrel = if_else(
    !is.na(hectaresquirrelnumber) & !is.na(age),
    paste(hectaresquirrelnumber, age),
    NA_character_
  ))
Make a comparison column with mutate
# difference between two columns, stored as a new comparison column
crimes <- crimes |>
  mutate(antitransvsgay = antigaymale - antitransgender)
Filter for absolute value
# keep rows where the difference is more than 0.5 in either direction
crimes |> filter(abs(antitransvsgay) > 0.5)
# it has to be written exactly like this: filter(abs(value) > #)
Renaming with mutate
and case_when
(also NAs)
Categorical/character/qualitative:
# source: failed attempt plugged into ChatGPT
# recode county into a grouped label; the first matching branch wins
crimes <- crimes |>
  mutate(town = case_when(
    county %in% c("Albany", "Allegany") ~ "A towns",
    county %in% c("Bronx", "Broome") ~ "B towns",
    county %in% c("Cattaraugus", "Cayuga", "Chautauqua", "Chemung", "Chenango", "Clinton", "Columbia", "Cortland") ~ "C Towns",
    county %in% c("Dutchess") ~ "D Towns", # was labelled "B Towns", a copy-paste slip
    TRUE ~ as.character(county) # anything else keeps its own county name
  ))
Numeric/quantitative:
# bin totaloffenders into labelled bands. case_when() uses the first branch
# that matches, so a boundary value (5, 15, ...) lands in the lower band; the
# bands are written so that they do not overlap.
crimes <- crimes |> mutate(offenderconcentration = case_when(
  totaloffenders == 0 ~ "None",
  totaloffenders >= 1 & totaloffenders <= 5 ~ "Very low",
  totaloffenders > 5 & totaloffenders <= 15 ~ "Low",
  totaloffenders > 15 & totaloffenders <= 35 ~ "Medium",
  totaloffenders > 35 & totaloffenders <= 50 ~ "Moderate",
  totaloffenders > 50 & totaloffenders <= 70 ~ "High",
  totaloffenders > 70 & totaloffenders <= 90 ~ "Super high",
  totaloffenders > 90 & totaloffenders <= 200 ~ "Serious issues",
  TRUE ~ NA_character_ # anything outside the bands (e.g. above 200) becomes NA
))
TRUE ~
at the end defines what to do with data that don’t match what’s specified by case_when
. TRUE ~ NA_character_
fills it with NA, TRUE ~ as.character(column)
has it fill in exactly what it said in the column, and TRUE ~ "whatever"
has it print what you specify in the quotes. if you don’t include this at all it’ll fill in NA
If_else vs case_when
case_when()
and if_else()
(a tidyverse version of ifelse()
) are very similar
if_else()
is the fastest (?)
if_else()
only uses 2 values, one if and one else. case_when()
can use more than 2, of any data type
supposedly one of the issues with
ifelse()
is that TRUE/FALSE values confuse it
it would appear that when you use
case_when
you use TRUE ~ ...
as your fallback, while with if_else
you write whatever you were going to write after TRUE ~
in case_when
Rearrange columns
# move anticatholic so it sits directly after antimale
crimes <- crimes |> relocate(anticatholic, .after = antimale)
Export the filtered dataset!!!!!!!! 😸
# save the cleaned tables to disk so the cleaning doesn't have to be redone
# (this is literally the best thing ever)
write_csv(x = squirinfo, file = "SQUIRINFO.csv")
write_csv(x = crimes, file = "CRIMES.csv")
- calculate means, median, etc - with NA
unique(squircensus$temperature)
[1] 70 54 60 55 66 65 74 59 77 64 56 71 61 73 58 80 67 62 57 48 68 63 79 52 46
[26] 50 81 43 53 72 49 51 40 47 76 44 69 84 78 75 45 30
mean
function (x, ...)
UseMethod("mean")
<bytecode: 0x138100e18>
<environment: namespace:base>
seq()
seq(#1, #2, by#)
# make R list times tables
seq(7, 200, 7)
[1] 7 14 21 28 35 42 49 56 63 70 77 84 91 98 105 112 119 126 133
[20] 140 147 154 161 168 175 182 189 196
seq(8, 200, 8)
[1] 8 16 24 32 40 48 56 64 72 80 88 96 104 112 120 128 136 144 152
[20] 160 168 176 184 192 200
seq(9, 200, 9)
[1] 9 18 27 36 45 54 63 72 81 90 99 108 117 126 135 144 153 162 171
[20] 180 189 198
Simple: create a dataframe with one value each
- use
tibble()
instead of data.frame
or data_frame
- can always pivot to long format if you structured it weird
# one value per opinion category
veryneg <- 495
neg <- 250
neut <- 115
pos <- 74
verypos <- 64
eh <- 87
totalneg <- veryneg + neg # the two negative categories combined

# wide format: one column per category, a single row
df <- tibble(veryneg = veryneg, neg = neg, neut = neut, pos = pos, verypos = verypos, eh = eh)

# same, with the two negative categories merged into totalneg
df2 <- tibble(totalneg = totalneg, neut = neut, pos = pos, verypos = verypos, eh = eh)
I realized I wanted my data to be structured with these values in just one column, so instead of redoing I just pivoted to long format:
# wide -> long: the listed column names go into "opinions";
# their numbers go into the default "value" column
df2 <- df2 %>%
  pivot_longer(
    cols = c("totalneg", "neut", "pos", "verypos", "eh"),
    names_to = "opinions")
df2
# A tibble: 5 × 2
opinions value
<chr> <dbl>
1 totalneg 745
2 neut 115
3 pos 74
4 verypos 64
5 eh 87
# percentage column - just because
# (each value as a fraction of the column total, so 0-1 rather than 0-100)
df2 <- df2 %>%
  mutate(percentage = (value / sum(value)))
Plotting it just for fun
# bar chart of the counts, bars ordered from smallest to largest value
p2 <- df2 |>
  ggplot(aes(x = reorder(opinions, value), y = value)) +
  geom_col() + # same as geom_bar(stat = "identity"): bar height = y value
  labs(x = "opinions", y = "share of population") +
  theme_bw()
p2
Involved example: generate specific, biased number sets
- specify a range and assign it to a variable
- use
dnorm()
with the parameters mean
and sd
(standard deviation) to generate the probabilities of each point in this range
# temperature column
# 365 uniformly random values between -5 and 102
temperature <- runif(n = 365, min = -5, max = 102)
# for this one I just don't care. give me some crazy values.

# windspeed column
# weighted (biased) random number generation
values <- 0:70 # this is the range of the wind speed
# this part's obviously normally done horizontally:
weights <- dnorm( # normal distribution density at each value
  values, # the 0:70 we just specified
  mean = 17, # the center of the *bell curve* will be 17
  sd = 7
)
Explanation: the sd = 7
is the standard deviation. about ~68% of values fall within 1 standard deviation of the mean
- every standard deviation covers more and more of your range (more distance from the mean)
- because 17 is the mean, it takes up to 3 standard deviations to get to 0
- 7 standard deviations only just gets to 70
- 7 standard deviations covers everything but only the edges get the 40:70 range.
- the lesser standard deviations covering everything else (the 1:30 range) makes them occur much more
assign a weight - a likelihood - of each value 0:70 falling on this bell curve. aka value 17 may be 5% likely while 50 may be like .01% if you open weights, its all percentages
# Down-weight parts of the range. The same mask is used on both sides of each
# assignment so the replacement has exactly the length of the slice it replaces.
weights[values > 40] <- weights[values > 40] * 0.2
# does the command to the weights of values above 40 (like df$col <- df$col)
# it scales each of those weights to 20% of itself
# this is an 80% decrease
weights[values >= 23 & values <= 39] <- weights[values >= 23 & values <= 39] * 0.6 # values 23:39, 60% of themselves
weights[values >= 0 & values <= 3] <- weights[values >= 0 & values <= 3] * 0.4 # values 0:3, 60% less likely
# "normalize" weights so they all add up to 1 (100%)
weights <- weights / sum(weights)
# actually assign windspeed: draw 365 values, each picked with its weight as probability
windspeed <- sample(values, size = 365,
                    replace = TRUE, # a value may be drawn more than once (needed: 365 draws from 71 values)
                    prob = weights)
# rainfall
values <- seq(0, 5, by = 0.1) # this allows for decimals.!!!
weights <- dnorm(values, mean = 0.5, sd = 1) # small standard deviation to make most of it be small
# make really heavy rainfall 50% less likely
weights[values >= 3 & values <= 5] <- weights[values >= 3 & values <= 5] * 0.5
# normalize so the weights add up to 1
weights <- weights / sum(weights)

# actually add rainfall
rainfall <- sample(values, size = 365, replace = TRUE, prob = weights)
Add together into a dataframe
# one column per generated vector (all three have 365 values)
oneyear <- tibble( # you use tibble instead of data.frame or data_frame
  temperature = temperature, windspeed = windspeed, rainfall = rainfall)
Add a year of dates
oneyear <- oneyear |>
  mutate(date = seq.Date( # creates one date per row
    from = as.Date("2023-01-01"), # first date of the sequence
    by = "day", # step forward one day at a time
    length.out = n())) # stop after n() dates, n() being the number of rows

# alternative format: give the last date instead of a length
w <- seq.Date(from = as.Date("2023-01-01"),
              to = as.Date("2023-12-31"), by = "day")
# rainfall over the year
# NOTE(review): `alpha = (rainfall)/3` is outside aes(), so it reads the
# standalone `rainfall` vector rather than the column - confirm intended
oneyear |>
  ggplot(aes(x = date, y = rainfall)) +
  geom_area(stat = "identity", fill = "#345573") +
  geom_bar(stat = "identity", fill = "#010A86", alpha = (rainfall)/3) +
  labs(x = "Date",
       y = "Rainfall (inches)",
       title = "Rainfall in Maryland during year 2025") +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5)) # hjust = 0.5 centers the title

# temperature over the year, line colored by temperature
oneyear |>
  ggplot(aes(x = date, y = temperature, color = temperature)) +
  geom_line(stat = "identity") +
  labs(x = "Date",
       y = "Temperature (°F)",
       title = "Temperature in Maryland during year 2025") +
  theme_minimal() +
  scale_color_gradient(low = "#E292CC", high = "#DC0000", aesthetics = "color") +
  guides(color = "none") + # hide the color legend
  theme(plot.title = element_text(hjust = 0.5))

# windspeed over the year, several point layers stacked
# NOTE(review): color = "#A08DC9" sits inside aes(), so ggplot treats it as a
# data value to map, not as the literal color - move it outside aes() if the
# hex color itself is wanted
oneyear |>
  ggplot(aes(x = date, y = windspeed)) +
  geom_area(aes(color = "#A08DC9", alpha = (rainfall)/100)) +
  geom_jitter(alpha = 0.5, color = "#321346") +
  geom_jitter(alpha = 0.2, color = "#040128") +
  geom_point(alpha = 0.7, color = "#070052") +
  geom_line(alpha = (rainfall)/9) +
  labs(x = "Date",
       y = "Windspeed (mph)",
       title = "Windspeed in Oregon during year 2025") +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5))
#+ scale_alpha(guide = "none") # idk man
I feel like you just figure this out as you go along
do not use na.omit
or drop_na
on an entire dataset
Use !is.na
# example from squirrel project
# keep rows where highlightfurcolor is present...
subsetsquirrel <- squircensus[!is.na(squircensus$highlightfurcolor), ]
# ...then, from those rows, keep the ones where otheractivities is present too
subsetsquirrel <- subsetsquirrel[!is.na(subsetsquirrel$otheractivities), ]
# the comma is necessary, and so is assigning the variable

# dplyr version of the same two conditions
squircensus |>
  filter(!is.na(highlightfurcolor) & !is.na(otheractivities))
ggplot basics
Defining the variables:
- ggplot(aes(x = column1, y = column2))
if you want to use a mean for either of these, you must assign the mean to a specific separate variable and call on that (eg: themean <- mean(data$col)
, aes(x=themean)
), or else it is most likely to fail or kill your render attempt and make you cry.
Color in ggplot2
- nondiscriminatory outlines:
aes()
with just x & y, then geom_bar(stat = "identity", color = "#9871A8")
color
refers to the outline, and fill
to the inside; to fill the bars in a color, type fill = "#5612D6"
. can be used with or without color
also specified
- discriminatory color (different by variable):
ggplot(aes(x = col1, y = col2, color = col3))
OR fill = col3
. fill or color depends on the geom type.
- next, enter
scale_fill_
or scale_color_
on its own line to enter which color
scale_fill_brewer()
pairs with RColorBrewer library, scale_fill_viridis()
for colorblind friendly palettes
scale_fill_manual()
or scale_color_manual
assigns specific colors to specific variables in the plot.
- another option for using
scale_fill_manual()
is to assign your colors to a variable as a vector, and call on them like: scale_fill_manual(values = acolors)
scale_fill_gradient()
creates a gradient; scale_fill_gradient2()
for a diverging (low, mid, high) gradient
- next, enter
# manual example: one named color per value of the fill variable
reptemps |>
  ggplot(aes(x = location, y = temperature, fill = location, alpha = temperature)) +
  geom_bar(stat = "identity") +
  scale_fill_manual(values = c("Ground Plane" = "#790033",
                               "Above Ground" = "#ee3a83"))
# if you don't specify the color for NA it will be grey

# gradient example: point color runs from `low` to `high` with temperature
squirrelplot <- reptemps |>
  ggplot(aes(x = numberofsquirrels, y = temperature, color = temperature)) +
  geom_point(alpha = 0.3) +
  scale_color_gradient(low = "#790033", high = "#ee3a83", aesthetics = "color")
squirrelplot
# a painfully familiar graph.
alpha
is transparency
- assign it to a variable inside of the
aes()
to have it be variable-dependent
- assign it to a number inside of the
geom_type()
to have it apply the same to all
- like in this graph, it can be used for all points to show concentration when points overlap
- assign it to a variable inside of the
Cuter ggplot2 aesthetics
- use
labs
to specify the labels for each thingy
- add \ after a label to add a space
theme
squirrelplot2 <- squirrelplot +
  labs(x = "number of squirrels",
       y = "temperature (°Fahrenheit)",
       title = "squirrels seen by temperature recorded",
       caption = "thesquirrelcensus",
       color = "TEMPERATURE °F",
       alpha = "whatever man") + # name the keys based on what they were. color, fill, size, alpha, etc
  geom_text(aes( # labels, one per data row
    alpha = temperature, # text transparency matches dots
    label = round(temperature, 2)), # round is unnecessary in this case
    hjust = -.5, # or vjust
    color = "#790033",
    size = 2.5) +
  # fixed labels: annotate() draws the text once at the given coordinates,
  # whereas geom_text(aes(x = 7.7, ...)) would redraw it for every data row
  annotate("text", x = 7.7, y = 90, # coordinates
           label = "squirrls", size = 14,
           color = "black", angle = 0.8) +
  annotate("text", x = 16, y = 40, # coordinates
           label = "nunber of thesquirrelcensus", size = 8,
           color = "black", angle = -0.8) +
  theme_gray(base_size = 12, base_family = "serif") # many different choices, library "ggthemes" adds more
squirrelplot2
To adjust the text alignment of title, labels, etc:
# example 2
# name   = legend title
# breaks = which values appear in the legend
# values = named vector: data value = color
# NOTE(review): breaks and the names in values don't line up (only "Cubic" is
# in both) - confirm against the original project
scale_fill_manual(name = "Regression Model",
                  breaks = c("race", "sentencelength", "Cubic"),
                  values = c(
                    "Cubic" = "pink",
                    "Quadratic" = "blue",
                    "Linear" = "purple"))
I have no idea what that one means, but it's in my MDIncarceration project I guess for some reason(??). I am including it in case it becomes useful.
Histogram and bar graphs in depth
# I don't know what I'm doing
# histogram only needs x; geom_histogram() counts the rows in each bin
# NOTE(review): color = "blue" is set directly on the geom, so the
# color = temperature mapping and the gradient scale are presumably not
# visible here - confirm
reptemps |>
  ggplot(aes(x = numberofsquirrels, color = temperature)) +
  geom_histogram(color = "blue") +
  scale_color_gradient(low = "#790033", high = "#ee3a83", aesthetics = "color")
geom_bar(stat = "fill")
does some cool stuff!
# whatever
Lollipop chart in ggplot2
A lollipop chart requires a categorical and numeric value, like a barchart. It is used to compare values to an overall value, like comparing the amount of crime in different cities to the average (or median) crime for all cities in the country.
It can be ideal to subset your data first to make it easier to know what you’re doing. You also need to assign the midline/baseline to a separate variable because you can’t call on a column inside aes()
The lollipop chart is a combination of three ggplot styles: geom_point
for the dots, geom_segment
for the sticks, and geom_hline
or geom_vline
for the midline (or whatever its called).
# subset: narrow down to one year & crime type
crime2016 <- crimes |>
  filter(year == "2016", crimetype == "Crimes Against Persons")

# define midline, the baseline mean/average.
offenderavg <- mean(crime2016$totaloffenders)

crime2016 |>
  ggplot(aes(x = county, y = totaloffenders, color = totaloffenders)) +
  geom_segment(aes(x = county,
                   y = offenderavg,
                   xend = county,
                   yend = totaloffenders)) + # creates lines leading to each dot
  geom_point(size = 4.5,
             alpha = 0.8) +
  scale_color_gradient(low = "#790033", high = "#ee3a83", aesthetics = "color") +
  geom_hline(yintercept = offenderavg, color = "#3d3b3c", size = .5) + # creates the main horizontal line
  labs(x = "County",
       y = "Total offenders",
       title = "2016 hate crime offenders by county compared to NY average",
       caption = "Source: NY State Division of Criminal Justice Services",
       color = "total offenders") +
  theme_minimal(base_size = 12, base_family = "sans") +
  # fixed labels placed by hand; annotate() draws each one once
  annotate("text", x = 9, y = 86, label = "Kings", color = "#56122e", size = 3.5) +
  annotate("text", x = 14, y = 87, label = "New York", color = "#56122e", size = 3.5) +
  theme(axis.text.x = element_blank()) # remove x-axis labels for cleanness
Simple adjusted lollipop:
crime2016 |>
  ggplot(aes(x = reorder(county, totaloffenders), # reorder/rearrange by value; list county sorted by totaloffenders
             y = totaloffenders, color = totaloffenders)) +
  # x is inherited from ggplot() above. Mapping x to plain `county` here puts
  # an unsorted variable on the same axis, which can undo the reorder().
  geom_segment(aes(y = offenderavg,
                   xend = reorder(county, totaloffenders),
                   yend = totaloffenders)) +
  geom_point(size = 4,
             alpha = 0.8) +
  scale_color_gradient(low = "#790033", high = "#ee3a83", aesthetics = "color") +
  geom_hline(yintercept = offenderavg, color = "#3d3b3c", size = .5) +
  labs(x = "County",
       y = "Total offenders",
       title = "2016 hate crime offenders by county compared to NY average",
       caption = "Source: NY State Division of Criminal Justice Services",
       color = "total offenders") +
  theme_minimal(base_size = 12, base_family = "sans") +
  theme(axis.text.x = element_blank())
- To flip the points to sort the other way, add a negative right before the value you’re sorting by:
aes(x=(reorder(county, -totaloffenders)
sometimes the lollipop chart displays 2 dots for one value. this happens if there are duplicates. to remove them, filter so there’s no duplicates
Further customization
- to make a vertical line instead, type
geom_vline
and specify xintercept = variable
.
I have not yet been able to figure out how to arrange the points in order. it fails and fails and fails.
I have also not yet figured out how to change the colors of the lines and points separately, but you can probably do that by specifying that in aes()
Text labels:
- you can add , "%"
or similar within paste in a geom_text(aes) to have it print that.
- use round()
to round to the nearest value (not always up). round(col, 1)
rounds to 1 decimal place, round(col, 2)
rounds to 2 decimal places, round(col, 0)
rounds to a whole number.
- if you have a percentage in tiny decimal format, type column * 100
to bring it to percentage format.
# p2 plus percentage labels and hand-placed dashed brackets.
# The brackets and their captions sit at fixed coordinates, so they use
# annotate(), which draws each item once instead of once per data row.
p2 +
  geom_text(aes(label =
                  paste0( # needed to make the % work
                    round(percentage * 100, 1), # perc column, times 100 to bring it from a decimal to a percentage, rounded to 1 decimal place
                    "%")), # paste % after
            size = 3, vjust = -0.4) +
  # first set: bracket over bars 1 and 2
  annotate("segment", x = 1, y = 130, xend = 1, yend = 230, size = 0.4, linetype = "longdash") +
  annotate("segment", x = 2, y = 150, xend = 2, yend = 230, size = 0.4, linetype = "longdash") +
  annotate("segment", x = 1, y = 230, xend = 2, yend = 230, size = 0.4, linetype = "longdash") +
  # text label
  annotate("text", x = 1.5, y = 260, label = "12.7% positive", size = 3) +
  # second set: bracket over bars 3 and 4
  annotate("segment", x = 3, y = 150, xend = 3, yend = 260, size = 0.4, linetype = "longdash") +
  annotate("segment", x = 4, y = 180, xend = 4, yend = 260, size = 0.4, linetype = "longdash") +
  annotate("segment", x = 3, y = 260, xend = 4, yend = 260, size = 0.4, linetype = "longdash") +
  # text label
  annotate("text", x = 3.5, y = 290, label = "18.6% ambivalent", size = 3) +
  # third set: bracket joining the first two labels
  annotate("segment", x = 1.5, y = 300, xend = 1.5, yend = 450, size = 0.4, linetype = "longdash") +
  annotate("segment", x = 3.5, y = 330, xend = 3.5, yend = 450, size = 0.4, linetype = "longdash") +
  annotate("segment", x = 1.5, y = 450, xend = 3.5, yend = 450, size = 0.4, linetype = "longdash") +
  annotate("text", x = 2.5, y = 480, label = "31.3% non-negative", size = 3) +
  annotate("segment", x = 3, y = 510, xend = 3, yend = 610, size = 0.4, linetype = "longdash") +
  # a line underneath the other lines for cuteness
  annotate("segment", x = 3, y = 330, xend = 3, yend = 420, size = 0.4, linetype = "longdash") +
  annotate("segment", x = 4.4, y = 610, xend = 3, yend = 610, size = 0.4, linetype = "longdash") +
  annotate("text", x = 3.75, y = 640, label = "87.3% non-positive", size = 3)
paste()
vs paste0()
?
- paste()
adds a space between the pieces by default.
- paste0()
means no space (nothing is put between the pieces).
Line chart/trendlines (overlapped plots)
ggplot legends and other customization
Remove legend
squirrelplot2 +
  scale_color_gradient(low = "#83D0CB", high = "#115278", aesthetics = "color") +
  theme(legend.position = c(.9, .4)) + # place the legend inside the plot at these relative x/y coordinates
  # remove the alpha legend
  scale_alpha(guide = "none")
Make TRANSPARENT
# transparent panel and plot backgrounds
transparentplot <- squirrelplot2 +
  theme(
    panel.background = element_rect(fill = "transparent"),
    plot.background = element_rect(fill = "transparent", color = NA)
  )

# bg = 'transparent' is passed to ggsave() so the saved image background is transparent too
ggsave("squirrelplot2.png", squirrelplot2, bg = 'transparent')
ggsave('transparent2.png', transparentplot, bg = 'transparent')

# fuller version: legend backgrounds transparent as well, gridlines removed
squirrelplot2 +
  theme(
    panel.background = element_rect(fill = "transparent", color = NA),
    plot.background = element_rect(fill = "transparent", color = NA),
    legend.background = element_rect(fill = "transparent", color = NA),
    legend.box.background = element_rect(fill = "transparent", color = NA),
    panel.grid = element_blank()
  )

# reusable theme() snippet to add onto any plot with +
theme(
  panel.background = element_rect(fill = 'transparent'), # transparent panel bg
  plot.background = element_rect(fill = 'transparent', color = NA), # transparent plot bg
  panel.grid.major = element_blank(), # remove major gridlines
  panel.grid.minor = element_blank(), # remove minor gridlines
  legend.background = element_rect(fill = 'transparent'), # transparent legend bg
  legend.box.background = element_rect(fill = 'transparent') # transparent legend panel
)
Stacking geom_plot types
If you specify the aes(x, y)
of a plot inside of geom_
instead of ggplot
you can stack plots.
# ridiculous fake yearly weather data
# helper values taken from the standalone vectors
tempmean <- mean(temperature)
rainmean <- mean(rainfall)
windmean <- mean(windspeed)
negrain <- -rainfall
negtemp <- -temperature
negwind <- -windspeed
tripwind <- windspeed * 3
doubrain <- rainfall * 2
doubtemp <- temperature * 2

# no aes() in ggplot(): each geom brings its own x/y, which is what lets
# different plot types be stacked on top of each other
ggplot(oneyear) +
  # geom_area(aes(x = windspeed, y = rainfall), fill = "#f9dbbd") +
  # geom_area(aes(x = tripwind, y = rainfall)) +
  geom_line(aes(x = temperature, y = rainfall), color = "#a53860") + # lmfao
  geom_segment(aes(x = temperature,
                   y = rainfall,
                   xend = tempmean,
                   yend = rainmean), color = "#450920", alpha = rainfall) + # one segment from every point to the (mean, mean) point
  geom_line(aes(x = rainfall, y = rainfall), color = "#ffa5ab") +
  theme_void()
# geom_segment(aes(x = windspeed,
#                  y = rainfall,
#                  xend = windmean,
#                  yend = rainfall))
Pie charts (which you must never use)
Highcharter basics
- begin each plot with
highchart()
orhchart()
hc_add_series()
- don’t even know how color works
- figure it out yourself
Density plot
as mentioned in ggplot2 section, a density plot shows the distribution and frequency
taken straight from data110 final:
# subset
tempsquirrel <- squircensus |>
  select(temperature, numberofsquirrels, primaryfurcolor) |>
  arrange(primaryfurcolor)

# subset further: one table per fur color
greysquirrel <- tempsquirrel |> filter(primaryfurcolor == "Gray")
blacksquirrel <- tempsquirrel |> filter(primaryfurcolor == "Black")
cinnamonsquirrel <- tempsquirrel |> filter(primaryfurcolor == "Cinnamon")

# density() computes the distribution of each temperature vector; hchart()
# draws the first one and every hc_add_series() layers another on top
hchart(
  # first density plot (temperature)
  density(tempsquirrel$temperature), type = "area",
  color = "#ad755a",
  name = "Temperature distribution") |>
  # second density plot (black squirrels)
  hc_add_series(
    density(blacksquirrel$temperature), type = "area",
    color = "#000",
    name = "Black Squirrel") |>
  # third density plot (grey squirrels)
  hc_add_series(
    density(greysquirrel$temperature), type = "area",
    color = "#c5bdc9",
    name = "Grey Squirrel") |>
  # fourth density plot (cinnamon squirrels)
  hc_add_series(
    density(cinnamonsquirrel$temperature),
    type = "area", name = "Cinnamon Squirrel", color = "#7f3300") |>
  # text features
  hc_title(text = "Squirrel Activity in Different Temperatures by Color",
           margin = 30,
           align = "center",
           style = list(color = "#4c2918")) |>
  hc_subtitle(text = "Source: The Squirrel Census",
              style = list(color = "#594135")) |>
  hc_xAxis(title = list(text = "Temperature (°F)",
                        margin = 5,
                        style = list(color = "#594135")))
Lollipop
I am unclear on how even to include the baseline
# subset: 2015 property crimes, a few columns, only counties with at least
# one anti-Jewish incident
crimeconcentration <- crimes |>
  filter(year == "2015", crimetype == "Property Crimes") |>
  select(county, year, offenderconcentration,
         "antigaymale", "antijewish", # interesting comparison
         totalincidents, totalvictims, totaloffenders, crimetype) |>
  filter(antijewish > 0)

highchart() |>
  hc_add_series(data = crimeconcentration,
                type = "lollipop", hcaes(x = county,
                                         y = antijewish, # bare column name; the quoted "antijewish" used before is just a piece of text, not the column
                                         group = offenderavg)) |> # NOTE(review): offenderavg is a single number, not a column - presumably not the right way to add the baseline
  # hc_colors(acolors) |>
  hc_xAxis(title = list(text = "County")) |>
  hc_yAxis(title = list(text = "Number of anti-jewish hatecrimes")) |>
  hc_subtitle(text = "NY State Division of Criminal Justice Services") |>
  hc_title( # from rdrr.io
    text = "<b>NY counties by anti Jewish hate crimes in 2015</b>",
    margin = 30,
    align = "center",
    style = list(color = "#3c3d3c")
  )
# adjust colors, make the tooltip show PERCENTAGE and not "0.0139381938503", adjust key
# in order to make a graph over years, I WOULD CONVERT TO LONG FORMAT
Bar
leafs