library(tidyverse)
library(lubridate) # date data
library(highcharter)
library(leaflet)
library(sf)
debt <- read_csv("household_debt.csv") # for a quant data example with percentages
crimes <- read_csv("hateCrimes2010.csv") # for a quantitative and somewhat categorical data example
squirinfo <- read_csv("centralparksquirrelcensus.csv") # for a categorical example that requires a ton of cleaning, and has GIS features
squirhectare <- read_csv("centralpartsquirrel_hectare.csv")r notes
List of color words R understands!!!!!! from paleturquoise to peachpuff3!!!!!!!!
to add:
☙ floor_date(date, "month") to round down to the start of the month; go off this example for other things
☙ pivot_longer and pivot_wider
☙ theme() customization
☙ substr()
☙ mutate with case_when (more examples of how the format works)
☙ absolute value
☙ time/date data, lubridate
☙ stacking plots (just slightly more)
☙ quarto/markdown customization, adjusting the html and css, how to make a dashboard, shiny: maybe just make a whole new document for this
str_detect returns a logical vector, where each value is TRUE, FALSE, or NA
- seq(), “Generate regular sequences. seq is a standard generic with a default method. seq.int is a primitive which can be much faster but has a few restrictions. seq_along and seq_len are very fast primitives for two common cases.”
PIPING: done with |> or %>%
+ is only used in ggplot to connect parts of your code together
$ refers to column
! means NOT; != means NOT EQUAL
= is the same as <- (assignment); == tests whether something is equal to this value
>= greater than or equal to
NaN means "not a number"; NA means a missing value ("not available")
Add message = FALSE or results = FALSE right after {r to make it just show the code and not the output in your rendered document. can use this to hide the messages but only show your graph.
type echo = FALSE to only print the output and hide the code. can use this to hide your code and just show your plot, but it might also include additional messages (not clear)
type include = FALSE to totally hide everything
sometimes you still see crap under your chunk after all of this, in which case add warning = FALSE
⭑ for extra insurance you can add this to your yaml:
execute:
message: false
Knitr/render/rpubs:
If you experience errors when knitting/rendering:
1. Clean global environment
2. Run each chunk from the beginning one by one
3. Make sure you don’t call any variables that you haven’t yet named
4. Make sure there’s a space after any # or ## (as in headings)
5. Make sure the yaml is normal
6. Make sure you actually loaded all the libraries in the correct order! (it’s this more than you think)
Always use read_csv (from tidyverse), and never read.csv
# Tidy the column names so they are easy to type: lower-case everything, then
# strip the spaces. Only `crimes` gets the extra pass that removes "-".
names(crimes) <- gsub("-", "", gsub(" ", "", tolower(names(crimes))))
names(debt) <- gsub(" ", "", tolower(names(debt)))
names(squirinfo) <- gsub(" ", "", tolower(names(squirinfo)))
names(squirhectare) <- gsub(" ", "", tolower(names(squirhectare)))
# No `by` is given, so left_join() matches on the column names the two tables
# share; when this was run it joined by hectare.
squircensus <- left_join(squirhectare, squirinfo)
invisible(head(crimes)) # use invisible() before a command in QMD or RMD to make it not print into the outputExcel formulas…
- You can type something like… =E2*E4 and then click the cell and drag it down to cover whatever you want
- To round, highlight what you want to round and click the thingy, in the home menu, under and to the right of “General”. the thing that looks like <-0 .00
Using base R, tidyverse
Remove rows/columns
- rows
brokensquirrel <- squircensus[-c(3:66, 68:87, 90:757, 760:1000, 1001:2536, 2540:3102), ]- columns (
select)
# remove columns
brokensquirrel <- brokensquirrel |> select(-sighterobservedweatherdata, -litternotes, -otheranimalsightings, -hectareconditions)
# select ONLY columns
brokensquirrel <- brokensquirrel |> select(litter, totaltimeofsighting, otheractivities, otherinteractions)
brokensquirrel# A tibble: 10 × 4
litter totaltimeofsighting otheractivities otherinteractions
<chr> <dbl> <chr> <chr>
1 Some 22 <NA> <NA>
2 Some 22 <NA> <NA>
3 None 20 <NA> <NA>
4 Abundant 25 <NA> couldn't get near
5 Abundant 25 <NA> <NA>
6 None 25 <NA> <NA>
7 Some 30 <NA> <NA>
8 Some 25 <NA> <NA>
9 Some 25 jumping stared at me
10 Some 25 laid down on the branch <NA>
# only 4 columns - and only 10 rows, because of previous chunksupposedly can also do this: table(dat$Marital_status, dat$approval_status) (source https://www.pluralsight.com/resources/blog/guides/testing-for-relationships-between-categorical-variables-using-the-chi-square-test)
Strip down to just numbers/similar - remove text
ex: transform messy weather data into just temperature
# turning the "sighterobservedweatherdata" column into just "temperature"
# str_extract() keeps only the FIRST run of digits in the text. Deleting every
# non-digit with gsub("\\D", "", ...) instead glues separate numbers together,
# so an entry typed as "~72-73" turned into 7273.
# assumes the temperature is the first number in the text - TODO confirm
squircensus <- squircensus |>
  mutate(
    temperature = as.numeric(str_extract(sighterobservedweatherdata, "\\d+"))
  )
# sanity check: filter() only keeps rows where the condition is TRUE, so this
# drops rows with no readable temperature (NA) and anything implausibly high
squircensus <- squircensus |> filter(temperature < 99)
Just remove text
textlesssquirrel <- squircensus %>%
mutate(sighterobservedweatherdata = as.numeric(gsub("[^0-9]", "", sighterobservedweatherdata)))
# it has to be ^ within the brackets. ! does not work at allPrinting, counting, sums
Count number of distinct/non repeated/whatnot:
# authorcount = n_distinct(author)Make a comparison column with mutate
crimes <- crimes |>
mutate(antitransvsgay = antigaymale - antitransgender)Filtering
Filter for repetition (multiple instances only) - using n()
Select only rows with duplicates (repeats in the column specified in group_by)
reptemps <-
squircensus |> group_by(temperature) |> filter(n() >1) # group_by then pipe means search just this column, n() is numberFilter pieces of text - using grep
ex: select only squirrels with “mushroom” in their other notes
fungallyinclinedsquirrels <- subset(squircensus, grepl("mushroom", squircensus$otheractivities))
fungallyinclinedsquirrelsgrepl:
Filter for absolute value
crimes |> filter(abs(antitransvsgay) > 0.5)
# it has to be written exactly like this: filter(abs(value) > #)group_by, summarise, n()
Use group_by() on its own or pair with summarise()
Can also use reframe() instead of summarise()
I am not quite sure what the difference is, here is what the message says:
Warning: Returning more (or less) than 1 row per `summarise()` group was
deprecated in dplyr 1.1.0.
ℹ Please use `reframe()` instead.
ℹ When switching from `summarise()` to `reframe()`, remember that
`reframe()` always returns an ungrouped data frame and adjust
accordingly.
Call `lifecycle::last_lifecycle_warnings()` to see where this
warning was generated.
`summarise()` has grouped output by 'anonymizedsighter'. You can
override using the `.groups` argument.
# something
squirred <- squircensus |>
group_by(anonymizedsighter) |> # this will be the first column of your subset, if you follow with the rest of this chunk
# it would help a lot to turn "anonymizedsighter" into character/categorical data for this
reframe( # previously summairse() : it's like mutate, define new columns for subset
squirrelsspotted = n(), # new column "squirrelsspotted" is the "n()" - number of times anonymizedsighter repeats
shift = shift,
hectare = hectare,
location = location
) note: if you find that your grouped_by variable repeats, that’s because one of the columns you’re calling has at least one different row per one group_by row
if that doesn’t help, you can apply first() to the columns that are messing it up (eg, if “location” is the issue, location = first(location)) - this will take the first given row for that item in the “location” column and apply it to all (eg indexing)
Make new column with mutate
weirddebt <- debt |>
  mutate(
    hugeloan = (autoloan * studentloan) * 1000000000000,
    # `+` adds the two columns row by row, so this is the mean of the combined
    # mortgage + credit card value (mean() gives one number, repeated on
    # every row)
    meanmoney = mean(mortgage + creditcard),
    # `&` is logical AND: a row is TRUE only when both values are non-zero, and
    # mean() of TRUE/FALSE is the share of TRUE rows. It gets its own name
    # because reusing `meanmoney` would overwrite the column defined just above.
    sharewithboth = mean(mortgage & creditcard)
  )
# I don't know why you would do this but you can, you can do pretty much anythingCombine columns with mutate, case_when, or if_else
squirinfo <- squirinfo |>
mutate(squirrel = case_when(
!is.na(hectaresquirrelnumber) & !is.na(age) ~
paste(hectaresquirrelnumber, age),
TRUE ~ NA_character_
))
# you need case_when() if you want to do something like this, making it return NA in the new column when any (or specifics) of the called-upon columns are NA
squirinfo <- squirinfo |>
mutate(squirrel = if_else(
!is.na(hectaresquirrelnumber) & !is.na(age),
paste(hectaresquirrelnumber, age),
NA_character_
))
# these do the same thing in this instance, but case_when is more versatile or something (can handle more than just two thingies. also the fallback is written differently at the end there.)Rename a single column
from Statology.org
# dplyr: rename() is written new_name = old_name
# the old name goes in quotes here because of the `*` in it
crimes <- crimes %>% rename(antiage = "antiage*")
# dplyr: the old column can also be given by its position (here, column 21)
crimes <- crimes %>% rename(antiunreligious = 21)
# base r: column birthname
#colnames(crimes)[colnames(crimes) == 'old_name'] <- 'new_name'
# when you think about it, this method is actually more intuitive.
# base r: column position
#colnames(crimes)[25] <- 'antieasternorthodox'Renaming with mutate and case_when (also NAs)
Categorical/character/qualitative:
# source: failed attempt plugged into ChatGPT
# Group counties by first letter; each line is condition ~ label, checked from
# top to bottom.
crimes <- crimes |>
  mutate(town = case_when(
    county %in% c("Albany", "Allegany") ~ "A towns",
    county %in% c("Bronx", "Broome") ~ "B towns",
    county %in% c(
      "Cattaraugus", "Cayuga", "Chautauqua", "Chemung",
      "Chenango", "Clinton", "Columbia", "Cortland"
    ) ~ "C towns",
    county %in% c("Dutchess") ~ "D towns",
    # every county not listed above keeps its own name
    TRUE ~ as.character(county)
  ))
Numeric/quantitative:
# Sort totaloffenders into labelled bands. case_when() checks the conditions
# from top to bottom and uses the first one that is TRUE, so a value sitting on
# a shared edge (5, 15, 35, ...) lands in the lower band.
crimes <- crimes |>
  mutate(
    offenderconcentration = case_when(
      totaloffenders == 0 ~ "None",
      between(totaloffenders, 1, 5) ~ "Very low",
      between(totaloffenders, 5, 15) ~ "Low",
      between(totaloffenders, 15, 35) ~ "Medium",
      between(totaloffenders, 35, 50) ~ "Moderate",
      between(totaloffenders, 50, 70) ~ "High",
      between(totaloffenders, 70, 90) ~ "Super high",
      between(totaloffenders, 90, 200) ~ "Serious issues",
      # anything that matched none of the bands above stays missing
      TRUE ~ NA_character_
    )
  )
TRUE ~ at the end defines what to do with data that don’t match what’s specified by case_when.
TRUE ~ NA_character_ fills it with NA
TRUE ~ as.character(column) has it fill in exactly what it said in the column
and TRUE ~ "whatever" has it print what you specify in the quotes. if you don’t include this at all it’ll fill in NA
If_else vs case_when
case_when() and if_else() (a tidyverse version of ifelse()) are very similar
if_else() is the fastest (?)
if_else() only uses 2 values, one if and one else. case_when() can use more than 2, of any data type
supposedly one of the issues with ifelse() is that TRUE/FALSE values confuse it
it would appear that when you use case_when you use TRUE ~ ... as your fallback, while with if_else you write whatever you were going to write after TRUE ~ in case_when
Rearrange columns
crimes <- crimes |> relocate(anticatholic, .after = antimale)Export the filtered dataset!!!!!!!! 😸
write_csv(squircensus, "SQUIRCENSUS.csv") # this is literally the best thing ever
write_csv(crimes, "CRIMES.csv")Stat reminder
Stat Parameter
Measures of center
x̄ estimates µ
- calculate means, median, etc - with NA
unique(squircensus$temperature) [1] 70 54 60 55 66 65 74 59 77 64 56 71 61 73 58 80 67 62 57 48 68 63 79 52 46
[26] 50 81 43 53 72 49 51 40 47 76 44 69 84 78 75 45 30
meanfunction (x, ...)
UseMethod("mean")
<bytecode: 0x12baf6890>
<environment: namespace:base>
seq()
seq(#1, #2, by#)
# make R list times tables
seq(7, 200, 7) [1] 7 14 21 28 35 42 49 56 63 70 77 84 91 98 105 112 119 126 133
[20] 140 147 154 161 168 175 182 189 196
seq(8, 200, 8) [1] 8 16 24 32 40 48 56 64 72 80 88 96 104 112 120 128 136 144 152
[20] 160 168 176 184 192 200
seq(9, 200, 9) [1] 9 18 27 36 45 54 63 72 81 90 99 108 117 126 135 144 153 162 171
[20] 180 189 198
The library lubridate is used for most of this…
Simple: create a dataframe with one value each
- use tibble() instead of data.frame or data_frame
- can always pivot to long format if you structured it weird
# one count per opinion category
veryneg <- 495
neg <- 250
neut <- 115
pos <- 74
verypos <- 64
eh <- 87
totalneg <- veryneg + neg
# one-row tibbles: each argument becomes a column holding a single value
# (written column_name = value)
df <- tibble(
  veryneg = veryneg, neg = neg, neut = neut,
  pos = pos, verypos = verypos, eh = eh
)
df2 <- tibble(
  totalneg = totalneg, neut = neut,
  pos = pos, verypos = verypos, eh = eh
)
I realized I wanted my data to be structured with these values in just one column, so instead of redoing I just pivoted to long format:
df2 <- df2 %>%
pivot_longer(
cols = c("totalneg", "neut", "pos", "verypos", "eh"),
names_to = "opinions")
df2# A tibble: 5 × 2
opinions value
<chr> <dbl>
1 totalneg 745
2 neut 115
3 pos 74
4 verypos 64
5 eh 87
# percentage column - just because
df2 <- df2 %>%
mutate(percentage = (value/sum(value)))Plotting it just for fun
p2 <- df2 |>
ggplot(aes(x = reorder(opinions, value), y = value)) +
geom_bar(stat = "identity") +
labs(x = "opinions", y = "share of population") +
theme_bw()
p2Involved example: generate specific, biased number sets
- specify a range and assign it to a variable
- use dnorm() with the parameters mean and sd (standard deviation) to generate the probabilities of each point in this range
# temperature column
temperature <- runif(n=365, min=-5, max=102)
# for this one I just don't care. give me some crazy values.
# windspeed column
# weighted (biased) random number generation
values <- 0:70 # this is the range of the wind speed
# this parts obviously normally done horizontally. :
weights <- dnorm( # normal Distribution
values, # the 0:70 we just specified
mean = 17, # the center of the *bell curve* will be 17
sd = 7
)Explanation: the sd = 7 is the standard deviation. about ~68% of values fall within 1 standard deviation of the mean
- every standard deviation covers more and more of your range (more distance from the mean)
- because 17 is the mean, it takes about 2.4 standard deviations (17 ÷ 7) to get down to 0
- 7 standard deviations above the mean only gets to 66 (17 + 7 × 7), so 70 is about 7.6 standard deviations out
- 7 standard deviations covers everything but only the edges get the 40:70 range.
- the lesser standard deviations covering everything else (the 1:30 range) makes them occur much more
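assign a weight - a likelihood - of each value 0:70 falling on this bell curve. aka value 17 may be 5% likely while 50 may be like .01%. if you open weights, it’s all small decimals (bell-curve heights), not percentages; they only become true probabilities once they are normalized to add up to 1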
# Make the strongest winds rarer: every value above 40 keeps only 20% of its
# weight (an 80% decrease). weights[mask] <- ... changes just the weights whose
# value matches the mask (like df$col <- df$col).
strong <- values > 40
weights[strong] <- weights[strong] * 0.2
# The SAME mask has to sit on both sides of `<-`. With two different ranges the
# right-hand side has a different length than the slots being filled, so R
# warns and the weights end up lined up against the wrong values.
brisk <- values >= 23 & values <= 39
weights[brisk] <- weights[brisk] * 0.6 # values 23:39 keep 60% of their weight
calm <- values >= 0 & values <= 3
weights[calm] <- weights[calm] * 0.4 # values 0:3 keep 40% of their weight
# "normalize" weights so they all add up to 1 (100%)
weights <- weights / sum(weights)
# actually assign windspeed
windspeed <- sample(values,
  size = 365,
  replace = TRUE, # a value is allowed to be drawn more than once
  prob = weights
)
# rainfall
values <- seq(0, 5, by=0.1) # this allows for decimals.!!!
weights <- dnorm(values, mean = 0.5, sd = 1) # small standard deviation to make most of it be small
weights[values >=3 & values <= 5] <- weights[values >=3 & values <= 5] * 0.5
# make really heavy rainfall 50% less likely
weights <- weights / sum(weights)
# actually add rainfall
rainfall <- sample(values, size = 365, replace = TRUE, prob = weights)Add together into a dataframe
oneyear <- tibble( # you use tibble instead of data.frame or data_frame
temperature = temperature, windspeed = windspeed, rainfall = rainfall)Add a year of dates
oneyear <- oneyear |>
mutate(date = seq.Date( # creates full year of dates
from = as.Date("2023-01-01"), # honestly I dont get it I had chatgpt write this line and I'm so tired I don't really feel like figuring this part out right now
by = "day", length.out = n())) # length of this data is n()
# alternative format
w <- seq.Date(from = as.Date("2023-01-01"),
to = as.Date("2023-12-31"), by = "day")
write_csv(oneyear, "oneyear.csv")oneyear |>
ggplot(aes(x = date, y = rainfall)) +
geom_area(stat = "identity", fill = "#345573") +
geom_bar(stat = "identity", fill = "#010A86", alpha = (rainfall)/3) +
labs(x = "Date",
y = "Rainfall (inches)",
title = "Rainfall in Washington during year 2025") +
theme_minimal() +
theme(plot.title = element_text(hjust = 0.5))oneyear |>
ggplot(aes(x = date, y = temperature, color = temperature)) +
geom_line(stat = "identity") +
labs(x = "Date",
y = "Temperature (°F)",
title = "Temperature in Washington during year 2025") +
theme_minimal() +
scale_color_gradient(low = "#E292CC", high = "#DC0000", aesthetics = "color") +
guides(color = "none") +
theme(plot.title = element_text(hjust = 0.5))oneyear |>
ggplot(aes(x = date, y = windspeed)) +
geom_area(aes(color = "#A08DC9", alpha = (rainfall)/100)) +
geom_jitter(alpha = 0.5, color = "#321346") +
geom_jitter(alpha = 0.2, color = "#040128") +
geom_point(alpha = 0.7, color = "#070052") +
geom_line(alpha = (rainfall)/9) +
labs(x = "Date",
y = "Windspeed (mph)",
title = "Windspeed in Oregon during year 2025") +
theme_minimal() +
theme(plot.title = element_text(hjust = 0.5))#+ scale_alpha(guide = "none") # idk manI feel like you just figure this out as you go along
do not use na.omit or drop_na on an entire dataset
Use !is.na
# example from squirrel project
subsetsquirrel <- squircensus[!is.na(squircensus$highlightfurcolor),]
subsetsquirrel <- squircensus[!is.na(squircensus$otheractivities),]
# the comma is necessary, and so is assigning the variable
squircensus |>
filter(!is.na(highlightfurcolor) & !is.na(otheractivities))
squircensus |>
filter(!is.na(highlightfurcolor) & !is.na(otheractivities)) ggplot basics
Defining the variables:
- ggplot(aes(x = column1, y = column2))
if you want to use a mean for either of these, you must assign the mean to a specific separate variable and call on that (eg: themean <- mean(data$col), aes(x=themean)), or else it is most likely to fail or kill your render attempt and make you cry.
Color in ggplot2
- nondiscriminatory outlines:
aes() with just x & y, then geom_bar(stat = "identity", color = "#9871A8")
color refers to the outline, and fill to the inside; to fill the bars in a color, type fill = "#5612D6". can be used with or without color also specified
- discriminatory color (different by variable):
ggplot(aes(x = col1, y = col2, color = col3)) OR fill = col3. fill or color depends on the geom type.
- next, enter
scale_fill_ or scale_color_ on its own line to enter which color
scale_fill_brewer() pairs with RColorBrewer library, scale_fill_viridis() for colorblind friendly palettes
scale_fill_manual() or scale_color_manual assigns specific colors to specific variables in the plot.
- another option for using
scale_fill_manual() is to assign your colors to a variable as a vector, and call on them like: scale_fill_manual(values = acolors)
scale_fill_gradient() creates a gradient; scale_fill_gradient2() for a multiple value gradient
- next, enter
# manual example
reptemps |>
ggplot(aes(x = location, y = temperature, fill=location, alpha = temperature)) +
geom_bar(stat = "identity") +
scale_fill_manual(values = c("Ground Plane" = "#790033",
"Above Ground" = "#ee3a83"))# if you don't specify the color for NA it will be grey
# gradient example
squirrelplot <- reptemps |>
ggplot(aes(x = numberofsquirrels, y = temperature, color = temperature)) +
geom_point(alpha = 0.3)+
scale_color_gradient(low = "#790033", high = "#ee3a83", aesthetics = "color")
squirrelplot# a painfully familiar graph.alphais transparency
- assign it to a variable inside of the
aes()to have it be variable-dependent
- assign it to a number inside of the
geom_type()to have it apply the same to all
- like in this graph, it can be used for all points to show concentration when points overlap
- assign it to a variable inside of the
Cuter ggplot2 aesthetics
- use
labsto specify the labels for each thingy
- add \ after a label to add a space
theme
squirrelplot2 <- squirrelplot +
labs(x = "number of squirrels",
y = "temperature (°Fahrenheit)",
title = "squirrels seen by temperature recorded",
caption = "thesquirrelcensus",
color = "TEMPERATURE °F",
alpha = "whatever man") + # name the keys based on what they were. color, fill, size, alpha, etc
geom_text(aes( # labels
alpha = temperature, # text transparency matches dots
label = round(temperature, 2)), # round is unnecessary in this case
hjust = -.5, # or vjust
color = "#790033",
size = 2.5) +
geom_text(aes(x=7.7, y=90, # coordinates
label="squirrls"), cex=14, # size(?)
color="black", angle = 0.8) +
geom_text(aes(x=16, y=40, # coordinates
label="nunber of thesquirrelcensus"), cex=8, # size(?)
color="black", angle = -0.8) +
theme_gray(base_size = 12, base_family = "serif") # many different choices, library "ggthemes" adds more
squirrelplot2To adjust the text alignment of title, labels, etc:
Specific color to variable with scale_whatever_manual
# subset: narrow down to one year & crime type
crime2016 <- crimes |>
filter(year == "2016", crimetype == "Crimes Against Persons")
ggplot(crime2016) +
geom_bar(aes(y = county, x = totalincidents,
# instead of a color/fill, name the column/variable
fill = "Total incidents of all crime"),
color = "#FFCB50", stat = "identity", alpha = 0.1) +
geom_bar(aes(y = county, x = antigaymale,
fill = "Anti gay male crime"),
color = "#3D3387", stat = "identity", alpha = 0.5) +
geom_bar(aes(y = county, x = antijewish,
fill = "Anti Jewish crime"),
color = "#ef4c2f", stat = "identity", alpha = 0.5) +
# now use scale_fill_manual (or scale_color_manual) to define the above "colors"
scale_fill_manual(values = c(
"Total incidents of all crime" = "#FFCB50",
"Anti Jewish crime" = "#ef4c2f",
"Anti gay male crime" = "#3D3387"
)) +
theme_minimal() +
labs(
title = "Most common hate crime committed in 2016, by city in NY",
x = "Incidents of hate crime",
y = "", # looks nicer without the "city" label, and I think y-axis is obvious
fill = "Crime"
)# example 2
# scale_fill_manual() arguments: `name` is the legend title, `values` maps each
# category label to a colour, and `breaks` picks which labels the legend lists.
scale_fill_manual(
  name = "Regression Model",
  # NOTE(review): "race" and "sentencelength" are not names in `values`, so no
  # colour is assigned to them here - confirm which labels were intended
  breaks = c("race", "sentencelength", "Cubic"),
  values = c(
    "Cubic" = "pink",
    "Quadratic" = "blue",
    "Linear" = "purple"
  )
)
I have no idea what that one means but its in my Incarceration project I guess for some reason(??) I am including it in case it becomes useful
Histogram and bar graphs in depth
# I don't know what I'm doing
reptemps |>
ggplot(aes(x = numberofsquirrels, color = temperature)) +
geom_histogram(color = "blue")+
scale_color_gradient(low = "#790033", high = "#ee3a83", aesthetics = "color")
geom_bar(position = "fill") does some cool stuff! (it stretches every bar to the same height so you can compare proportions)
# whateverLollipop chart in ggplot2
A lollipop chart requires a categorical and numeric value, like a barchart. It is used to compare values to an overall value, like comparing the amount of crime in different cities to the average (or median) crime for all cities in the country.
It can be ideal to subset your data first to make it easier to know what you’re doing. You also need to assign the midline/baseline to a separate variable because you can’t call on a column inside aes()
The lollipop chart is a combination of three ggplot styles: geom_point for the dots, geom_segment for the sticks, and geom_hline or geom_vline for the midline (or whatever its called).
# define midline, the baseline mean/average.
offenderavg <- mean(crime2016$totaloffenders)crime2016 |>
ggplot(aes(x = county, y = totaloffenders, color = totaloffenders)) +
geom_segment(aes(x = county,
y = offenderavg,
xend = county,
yend = totaloffenders)) + # creates lines leading to each dot
geom_point(size = 4.5,
alpha = 0.8) +
scale_color_gradient(low = "#790033", high = "#ee3a83", aesthetics = "color") +
geom_hline(yintercept = offenderavg, color = "#3d3b3c", size = .5) + # creates the main horizontal line
labs(x = "County",
y = "Total offenders",
title = "2016 hate crime offenders by county compared to NY average",
caption = "Source: NY State Division of Criminal Justice Services",
color = "total offenders") +
theme_minimal(base_size = 12, base_family = "sans") +
geom_text(aes(x=9, y=86, label="Kings"), color="#56122e", size = 3.5) +
geom_text(aes(x=14, y=87, label="New York"), color="#56122e", size = 3.5) +
theme(axis.text.x=element_blank()) # remove x-axis labels for cleannessSimple adjusted lollipop:
crime2016 |>
ggplot(aes(x = reorder(county, totaloffenders), # reorder/rearrange by value; list county sorted by totaloffenders
y = totaloffenders, color = totaloffenders)) +
geom_segment(aes(x = county,
y = offenderavg,
xend = county,
yend = totaloffenders)) +
geom_point(size = 4,
alpha = 0.8) +
scale_color_gradient(low = "#790033", high = "#ee3a83", aesthetics = "color") +
geom_hline(yintercept = offenderavg, color = "#3d3b3c", size = .5) +
labs(x = "County",
y = "Total offenders",
title = "2016 hate crime offenders by county compared to NY average",
caption = "Source: NY State Division of Criminal Justice Services",
color = "total offenders") +
theme_minimal(base_size = 12, base_family = "sans") +
theme(axis.text.x=element_blank())- To flip the points to sort the other way, add a negative right before the value you’re sorting by:
aes(x=(reorder(county, -totaloffenders)
sometimes the lollipop chart displays 2 dots for one value. this happens if there are duplicates. to remove them, filter so there’s no duplicates
Further customization
- to make a vertical line instead, type
geom_vline and specify xintercept = variable.
I have not yet been able to figure out how to arrange the points in order. it fails and fails and fails.
I have also not yet figured out how to change the colors of the lines and points separately, but you can probably do that by specifying that in aes()
Text labels:
- you can add , "%" or similar within paste in a geom_text(aes) to have it print that.
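- use round() to round to the nearest value. round(col, 1) rounds to 1 decimal place (0.1), round(col, 2) rounds to 2 decimal places (0.01), round(col, 0) rounds to a whole number. (use ceiling() if you actually want to always round up.)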
- if you have a percentage in tiny decimal format, type column * 100 to bring it to percentage format.
# The brackets and their captions are fixed drawings, not data, so they live in
# their own small tables and are drawn with inherit.aes = FALSE. Writing the
# numbers inside aes() on a layer that still uses the plot's data draws the
# same line or label once per data row, stacked on top of itself.
brackets <- tribble(
  ~x,  ~y,  ~xend, ~yend,
  # first set ("positive")
  1,   130, 1,     230,
  2,   150, 2,     230,
  1,   230, 2,     230,
  # second set ("ambivalent")
  3,   150, 3,     260,
  4,   180, 4,     260,
  3,   260, 4,     260,
  # third set ("non-negative")
  1.5, 300, 1.5,   450,
  3.5, 330, 3.5,   450,
  1.5, 450, 3.5,   450,
  # fourth set ("non-positive")
  3,   510, 3,     610,
  # a line underneath the other lines for cuteness
  3,   330, 3,     420,
  4.4, 610, 3,     610
)
bracketlabels <- tribble(
  ~x,   ~y,  ~label,
  1.5,  260, "12.7% positive",
  3.5,  290, "18.6% ambivalent",
  2.5,  480, "31.3% non-negative",
  3.75, 640, "87.3% non-positive"
)
p2 +
  # percentage label above every bar
  geom_text(
    aes(label = paste0( # paste0 glues the number and "%" together, no space
      round(percentage * 100, 1), # decimal share -> percentage, 1 decimal place
      "%"
    )),
    size = 3, vjust = -0.4
  ) +
  # dashed bracket lines
  geom_segment(
    data = brackets,
    aes(x = x, y = y, xend = xend, yend = yend),
    inherit.aes = FALSE, size = 0.4, linetype = "longdash"
  ) +
  # bracket captions
  geom_text(
    data = bracketlabels,
    aes(x = x, y = y, label = label),
    inherit.aes = FALSE, size = 3
  )
**paste() vs paste0()?
- paste() adds a space by default.
- paste0() means no space.
Line chart/trendlines (overlapped plots)
ggplot legends and other customization
Remove legend
squirrelplot2 +
scale_color_gradient(low = "#83D0CB", high = "#115278", aesthetics = "color") +
theme(legend.position = c(.9, .4)) +
# remove the alpha legend
scale_alpha(guide = "none")Make TRANSPARENT
transparentplot <- squirrelplot2 +
theme(
panel.background = element_rect(fill = "transparent"),
plot.background = element_rect(fill = "transparent", color = NA)
)ggsave("squirrelplot2.png", squirrelplot2, bg='transparent')
ggsave('transparent2.png', transparentplot, bg='transparent')squirrelplot2 +
theme(
panel.background = element_rect(fill = "transparent", color = NA),
plot.background = element_rect(fill = "transparent", color = NA),
legend.background = element_rect(fill = "transparent", color = NA),
legend.box.background = element_rect(fill = "transparent", color = NA),
panel.grid = element_blank()
) theme(
panel.background = element_rect(fill='transparent'), #transparent panel bg
plot.background = element_rect(fill='transparent', color=NA), #transparent plot bg
panel.grid.major = element_blank(), #remove major gridlines
panel.grid.minor = element_blank(), #remove minor gridlines
legend.background = element_rect(fill='transparent'), #transparent legend bg
legend.box.background = element_rect(fill='transparent') #transparent legend panel
)
Stacking geom_plot types
If you specify the aes(x, y) of a plot inside of geom_ instead of ggplot you can stack plots.
# ridiculous fake yearly weather data
tempmean <- mean(temperature)
rainmean <- mean(rainfall)
windmean <- mean(windspeed)
negrain <- -rainfall
negtemp <- -temperature
negwind <- -windspeed
tripwind <- windspeed*3
doubrain <- rainfall*2
doubtemp <- temperature*2
# Each geom_ carries its own aes(), which is what lets different x/y pairs be
# stacked in one plot.
ggplot(oneyear) +
  # geom_area(aes(x = windspeed, y = rainfall), fill = "#f9dbbd") +
  # geom_area(aes(x = tripwind, y = rainfall)) +
  geom_line(aes(x = temperature, y = rainfall), color = "#a53860") + # lmfao
  # a line from every point to the (mean temperature, mean rainfall) point
  geom_segment(
    aes(x = temperature, y = rainfall, xend = tempmean, yend = rainmean),
    color = "#450920", alpha = rainfall
  ) +
  geom_line(aes(x = rainfall, y = rainfall), color = "#ffa5ab") +
  labs(title = "One year of rainfall in Oregon") +
  # a complete theme like theme_void() resets every theme setting, so it has to
  # come BEFORE the theme() tweaks or it wipes them out
  theme_void() +
  # hjust = 0.5 centres the title
  theme(plot.title = element_text(hjust = 0.5, face = "bold"))
# geom_segment(aes(x = windspeed,
# y = rainfall,
# xend = windmean,
# yend = rainfall))
Pie charts (which you must never use)
Highcharter basics
- begin each plot with
highchart() or hchart()
hc_add_series()
- don’t even know how color works
- figure it out yourself
- chatgpt is not going to help you with this
- if you don’t understand anything about JavaScript it’s best to give up on highcharter and find another library (a note to self)
Density plot
as mentioned in ggplot2 section, a density plot shows the distribution and frequency
taken straight from data final:
# subset
tempsquirrel <- squircensus |>
select(temperature, numberofsquirrels, primaryfurcolor) |>
arrange(primaryfurcolor)
# subset further
greysquirrel <- tempsquirrel |> filter(primaryfurcolor == "Gray")
blacksquirrel <- tempsquirrel |> filter(primaryfurcolor == "Black")
cinnamonsquirrel <- tempsquirrel |> filter(primaryfurcolor == "Cinnamon")
hchart(
# first density plot (temperature)
density(tempsquirrel$temperature), type = "area",
color = "#ad755a",
name = "Temperature distribution") |>
# second density plot (black squirrels)
hc_add_series(
density(blacksquirrel$temperature), type = "area",
color = "#000",
name = "Black Squirrel") |>
# third density plot (grey squirrels)
hc_add_series(
density(greysquirrel$temperature), type = "area",
color = "#c5bdc9",
name = "Grey Squirrel") |>
# fourth density plot (cinnamon squirrels)
hc_add_series(
density(cinnamonsquirrel$temperature),
type = "area", name = "Cinnamon Squirrel", color = "#7f3300") |>
# text features
hc_title(text="Squirrel Activity in Different Temperatures by Color",
margin = 30,
align = "center",
style = list(color = "#4c2918") ) |>
hc_subtitle(text="Source: The Squirrel Census",
style = list(color = "#594135")) |>
hc_xAxis(title = list(text="Temperature (°F)",
margin = 5,
style = list(color = "#594135"))) Lollipop
I am unclear on how even to include the baseline
crimeconcentration <- crimes |>
filter(year == "2015", crimetype == "Property Crimes") |>
select(county, year, offenderconcentration,
"antigaymale", "antijewish",
totalincidents, totalvictims, totaloffenders, crimetype) |>
filter(antijewish > 0)
highchart() |>
  hc_add_series(
    data = crimeconcentration,
    type = "lollipop",
    # hcaes() takes bare column names, like aes(); a quoted "antijewish" is
    # read as a piece of text instead of the column.
    # `group = offenderavg` was dropped: offenderavg is a single number defined
    # outside this table, not a column to split the series by.
    hcaes(x = county, y = antijewish)
  ) |>
  # hc_colors(acolors) |>
  hc_xAxis(title = list(text = "County")) |>
  hc_yAxis(title = list(text = "Number of anti-jewish hatecrimes")) |>
  hc_subtitle(text = "NY State Division of Criminal Justice Services") |>
  hc_title( # from rdrr.io
    text = "<b>NY counties by anti Jewish hate crimes in 2015</b>",
    margin = 30,
    align = "center",
    style = list(color = "#3c3d3c")
  )
Bar
leafs
basics….
- theres like multiple types of functions
- in R, you need brackets