r notes

Author

Albert

List of color words R understands!!!!!! from paleturquoise to peachpuff3!!!!!!!!

to add:
☙ floor_date(date, "month") to round down to the start of the month (round_date() rounds to the nearest month); go off this example for other things
☙ pivot_longer and pivot_wider
☙ theme() customization
☙ substr()
☙ mutate with case_when (more examples of how the format works)
☙ absolute value
☙ time/date data, lubridate
☙ stacking plots (just slightly more)
☙ quarto/markdown customization, adjusting the html and css, how to make a dashboard, shiny: maybe just make a whole new document for this
str_detect returns a logical vector (TRUE, FALSE, or NA) saying whether each string contains the pattern
- seq(), “Generate regular sequences. seq is a standard generic with a default method. seq.int is a primitive which can be much faster but has a few restrictions. seq_along and seq_len are very fast primitives for two common cases.”

PIPING: done with |> or %>%
+ is only used in ggplot to connect parts of your code together
$ refers to column
! means NOT; != means NOT EQUAL
= is the same as <- (assignment); == tests "equal to this value"
>= greater than or equal to
NaN means "not a number"; NA means a missing value ("not available")

Add message = FALSE right after {r to hide the messages a chunk prints, or results = FALSE to hide its printed text output, in your rendered document. can use this to hide the messages but only show your graph.
type echo = FALSE to only print the output and hide the code. can use this to hide your code and just show your plot, but it might also include additional messages (not clear)
type include = FALSE to totally hide everything

sometimes you still see crap under your chunk after all of this, in which case add warning = FALSE

⭑ for extra insurance you can add this to your yaml:

execute:
  message: false

Knitr/render/rpubs:
If you experience errors when knitting/rendering:
1. Clean global environment
2. Run each chunk from the beginning one by one
3. Make sure you don’t call any variables that you haven’t yet named
4. Make sure there’s a space after any of the heading markers (## Heading, not ##Heading)
5. Make sure the yaml is normal
6. Make sure you actually loaded all the libraries in the correct order! (it’s this more than you think)

library(tidyverse)
library(lubridate) # date data
library(highcharter)
library(leaflet)
library(sf)

debt <- read_csv("household_debt.csv") # for a quant data example with percentages
crimes <- read_csv("hateCrimes2010.csv") # for a quantitative and somewhat categorical data example
squirinfo <- read_csv("centralparksquirrelcensus.csv") # for a categorical example that requires a ton of cleaning, and has GIS features
squirhectare <- read_csv("centralpartsquirrel_hectare.csv")

Always use read_csv (from tidyverse), and never read.csv

# Normalise column names: lower-case everything, then strip spaces
# (and, for crimes only, hyphens as well).
names(crimes) <- names(crimes) |>
  tolower() |>
  gsub(pattern = " ", replacement = "") |>
  gsub(pattern = "-", replacement = "") # remove troublesome symbols

names(debt) <- names(debt) |>
  tolower() |>
  gsub(pattern = " ", replacement = "")

names(squirinfo) <- names(squirinfo) |>
  tolower() |>
  gsub(pattern = " ", replacement = "")
names(squirhectare) <- names(squirhectare) |>
  tolower() |>
  gsub(pattern = " ", replacement = "")

# No `by` is given, so left_join() matches on every column name the two tables
# share and prints a message listing the columns it used.
squircensus <- left_join(squirhectare, squirinfo) # it joined by hectare (per the original note; check the printed join message)


invisible(head(crimes)) # use invisible() before a command in QMD or RMD to make it not print into the output

Excel formulas…
- You can type something like… =E2*E4 and then click the cell and drag it down to cover whatever you want
- To round, highlight what you want to round and click the thingy, in the home menu, under and to the right of “General”. the thing that looks like <-0 .00

Using base R, tidyverse

Remove rows/columns

rows

brokensquirrel <- squircensus[-c(3:66, 68:87, 90:757, 760:1000, 1001:2536, 2540:3102), ]

columns (select)

# remove columns
brokensquirrel <- brokensquirrel |> select(-sighterobservedweatherdata, -litternotes, -otheranimalsightings, -hectareconditions)
# select ONLY columns
brokensquirrel <- brokensquirrel |> select(litter, totaltimeofsighting, otheractivities, otherinteractions)
brokensquirrel

# A tibble: 10 × 4
   litter   totaltimeofsighting otheractivities         otherinteractions
   <chr>                  <dbl> <chr>                   <chr>            
 1 Some                      22 <NA>                    <NA>             
 2 Some                      22 <NA>                    <NA>             
 3 None                      20 <NA>                    <NA>             
 4 Abundant                  25 <NA>                    couldn't get near
 5 Abundant                  25 <NA>                    <NA>             
 6 None                      25 <NA>                    <NA>             
 7 Some                      30 <NA>                    <NA>             
 8 Some                      25 <NA>                    <NA>             
 9 Some                      25 jumping                 stared at me     
10 Some                      25 laid down on the branch <NA>

# only 4 columns - and only 10 rows, because of previous chunk

supposedly can also do this: table(dat$Marital_status, dat$approval_status) (source https://www.pluralsight.com/resources/blog/guides/testing-for-relationships-between-categorical-variables-using-the-chi-square-test)

Strip down to just numbers/similar - remove text

ex: transform messy weather data into just temperature

# Turn the free-text "sighterobservedweatherdata" column into a numeric "temperature" column.
squircensus <- squircensus |>
  mutate(temperature = gsub("\\D", "", sighterobservedweatherdata)) # "\\D" matches every non-digit character; replacing each with "" leaves only the digits, still stored as text
squircensus$temperature <- as.numeric(as.character(squircensus$temperature)) 
# ^ converts the digit text to numbers; entries with no digits at all become NA
# some were input "~72-73", which the digit-stripping above turns into 7273, so:
squircensus <- squircensus |> filter(temperature <99) # keeps only rows below 99: this removes those rows (and rows where temperature is NA) rather than fixing them

Just remove text

textlesssquirrel <- squircensus %>%
  mutate(sighterobservedweatherdata = as.numeric(gsub("[^0-9]", "", sighterobservedweatherdata)))
# it has to be ^ within the brackets. ! does not work at all

Printing, counting, sums

Count number of distinct/non repeated/whatnot:

# authorcount = n_distinct(author)

Make a comparison column with `mutate`

crimes <- crimes |>
  mutate(antitransvsgay = antigaymale - antitransgender)

Filtering

Filter for repetition (multiple instances only) - using `n()`

Select only rows with duplicates (repeats in the column specified in group_by)

# Keep only rows whose temperature value occurs more than once in the data.
reptemps <-
  squircensus |>
  group_by(temperature) |> # one group per distinct temperature
  filter(n() > 1) |> # n() is the number of rows in the current group
  ungroup() # drop the grouping so later steps operate on the whole table again

Filter pieces of text - using `grep`

ex: select only squirrels with “mushroom” in their other notes

fungallyinclinedsquirrels <- subset(squircensus, grepl("mushroom", squircensus$otheractivities))
fungallyinclinedsquirrels

grepl:

Filter for absolute value

crimes |> filter(abs(antitransvsgay) > 0.5)
# it has to be written exactly like this: filter(abs(value) > #)

group_by, summarise, n()

Use `group_by()` on its own or pair with `summarise()`

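Can also use reframe() instead of summarise()
The difference: summarise() is meant to return exactly one row per group, while reframe() may return any number of rows per group and always returns an ungrouped data frame. Here is what the message says: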

Warning: Returning more (or less) than 1 row per `summarise()` group was
deprecated in dplyr 1.1.0.
ℹ Please use `reframe()` instead.
ℹ When switching from `summarise()` to `reframe()`, remember that
  `reframe()` always returns an ungrouped data frame and adjust
  accordingly.
Call `lifecycle::last_lifecycle_warnings()` to see where this
warning was generated.
`summarise()` has grouped output by 'anonymizedsighter'. You can
override using the `.groups` argument.

# Per-sighter subset: count the rows for each sighter and carry along shift, hectare and location.
squirred <- squircensus |>
  group_by(anonymizedsighter) |> # this will be the first column of your subset, if you follow with the rest of this chunk
  
  # it would help a lot to turn "anonymizedsighter" into character/categorical data for this
  
  reframe( # previously summarise(): like mutate, it defines the new columns of the subset
    squirrelsspotted = n(), # n() is the number of rows in the current group, i.e. how many times this anonymizedsighter appears
    shift = shift, # these three return every row of the group, not one value per group,
    hectare = hectare, # which is why summarise() warned and reframe() is used instead
    location = location
  )

note: if you find that your grouped_by variable repeats, that’s because one of the columns you’re calling has at least one different row per one group_by row
if that doesn’t help, you can apply first() to the columns that are messing it up (eg, if “location” is the issue, location = first(location)) - this will take the first given row for that item in the “location” column and apply it to all (eg indexing)

Make new column with `mutate`

# Demonstrates that mutate() accepts arbitrary expressions when defining new columns.
weirddebt <- debt |>
  mutate(hugeloan = (autoloan*studentloan)*1000000000000,
         meanmoney = mean((mortgage + creditcard)),
         meanmoney = mean((mortgage & creditcard)),
         # `+` adds the two columns row by row, so the first meanmoney is the mean of the sums.
         # `&` is logical AND: each number is treated as TRUE unless it is 0, so the second
         # line is the share of rows where both columns are non-zero.
         # Both lines use the same name, so the second meanmoney overwrites the first.
         # mean() returns one number, which mutate() repeats on every row.
         )
# I don't know why you would do this but you can, you can do pretty much anything

Combine columns with `mutate`, `case_when`, or `if_else`

squirinfo <- squirinfo |>
  mutate(squirrel = case_when(
    !is.na(hectaresquirrelnumber) & !is.na(age) ~ 
      paste(hectaresquirrelnumber, age),
    TRUE ~ NA_character_
    ))
# you need case_when() if you want to do something like this, making it return NA in the new column when any (or specifics) of the called-upon columns are NA
squirinfo <- squirinfo |>
  mutate(squirrel = if_else(
    !is.na(hectaresquirrelnumber) & !is.na(age),
    paste(hectaresquirrelnumber, age),
      NA_character_
  ))
# these do the same thing in this instance, but case_when is more versatile or something (can handle more than just two thingies. also the fallback is written differently at the end there.)

Rename a single column

from Statology.org

# dplyr: rename by the column's current name, written as new_name = old_name.
# rename() replaces the superseded rename_at(); the backticks are needed
# because the old name contains `*`.
crimes <- crimes %>% rename(antiage = `antiage*`)
# dplyr: rename by column position (here the 21st column)
crimes <- crimes %>% rename(antiunreligious = 21)
# base r: by the column's current name
#colnames(crimes)[colnames(crimes) == 'old_name'] <- 'new_name'
# when you think about it, this method is actually more intuitive.
# base r: by column position
#colnames(crimes)[25] <- 'antieasternorthodox'

Renaming with `mutate` and `case_when` (also NAs)

Categorical/character/qualitative:

# source: failed attempt plugged into ChatGPT
# Collapse counties into first-letter buckets in a new `town` column.
# Labels use one consistent capitalisation, and Dutchess goes to "D towns"
# (it was previously mislabelled as a B bucket).
crimes <- crimes|>
    mutate(town = case_when(
    county %in% c("Albany", "Allegany") ~ "A towns",
    county %in% c("Bronx", "Broome") ~ "B towns",
    county %in% c("Cattaraugus", "Cayuga", "Chautauqua", "Chemung", "Chenango", "Clinton", "Columbia", "Cortland") ~ "C towns",
    county %in% c("Dutchess") ~ "D towns",
    TRUE ~ as.character(county) # any county not listed keeps its own name
  ))

Numeric/quantitative:

# Bucket totaloffenders into labelled bands. between(x, a, b) is x >= a & x <= b.
# Neighbouring bands share their boundary value (5, 15, 35, ...); case_when()
# uses the first matching line, so a boundary value gets the lower band's label.
crimes <- crimes |>
  mutate(
    offenderconcentration = case_when(
      totaloffenders == 0 ~ "None",
      between(totaloffenders, 1, 5) ~ "Very low",
      between(totaloffenders, 5, 15) ~ "Low",
      between(totaloffenders, 15, 35) ~ "Medium",
      between(totaloffenders, 35, 50) ~ "Moderate",
      between(totaloffenders, 50, 70) ~ "High",
      between(totaloffenders, 70, 90) ~ "Super high",
      between(totaloffenders, 90, 200) ~ "Serious issues",
      TRUE ~ NA_character_ # anything else (e.g. above 200, or NA) stays NA
    )
  )

TRUE ~ at the end defines what to do with data that don’t match what’s specified by case_when.
TRUE ~ NA_character_ fills it with NA
TRUE ~ as.character(column) has it fill in exactly what it said in the column
and TRUE ~ "whatever" has it print what you specify in the quotes. if you don’t include this at all it’ll fill in NA

If_else vs case_when

case_when() and if_else() (a tidyverse version of ifelse()) are very similar

if_else() is the fastest (?)
if_else() only uses 2 values, one if and one else. case_when() can use more than 2, of any data type
supposedly one of the issues with ifelse() is that TRUE/FALSE values confuse it
it would appear that when you use case_when you use TRUE ~ ... as your fallback, while with if_else you write whatever you were going to write after TRUE ~ in case_when

Rearrange columns

crimes <- crimes |> relocate(anticatholic, .after = antimale)

Export the filtered dataset!!!!!!!! 😸

write_csv(squircensus, "SQUIRCENSUS.csv") # this is literally the best thing ever
write_csv(crimes, "CRIMES.csv")

Stat reminder

Stat Parameter
Measures of center
x̄ estimates µ

calculate means, median, etc - with NA

unique(squircensus$temperature)

 [1] 70 54 60 55 66 65 74 59 77 64 56 71 61 73 58 80 67 62 57 48 68 63 79 52 46
[26] 50 81 43 53 72 49 51 40 47 76 44 69 84 78 75 45 30

mean

function (x, ...) 
UseMethod("mean")
<bytecode: 0x12baf6890>
<environment: namespace:base>

seq()

seq(#1, #2, by = #) — count from #1 up to (at most) #2, in steps of `by`

# make R list times tables
seq(7, 200, 7)

 [1]   7  14  21  28  35  42  49  56  63  70  77  84  91  98 105 112 119 126 133
[20] 140 147 154 161 168 175 182 189 196

seq(8, 200, 8)

 [1]   8  16  24  32  40  48  56  64  72  80  88  96 104 112 120 128 136 144 152
[20] 160 168 176 184 192 200

seq(9, 200, 9)

 [1]   9  18  27  36  45  54  63  72  81  90  99 108 117 126 135 144 153 162 171
[20] 180 189 198

The library lubridate is used for most of this…

Simple: create a dataframe with one value each

use tibble() instead of data.frame or data_frame
can always pivot to long format if you structured it weird

# One count per opinion category.
veryneg <- 495
neg <- 250
neut <- 115
pos <- 74
verypos <- 64
eh <- 87
totalneg <- veryneg + neg # both negative categories combined

# One-row tibble with a column per category
# (the column was previously misspelled `verypost`).
df <- tibble(veryneg = veryneg, neg = neg, neut = neut, pos = pos, verypos = verypos, eh = eh)

# Same, with the two negative categories merged into totalneg.
df2 <- tibble(totalneg = totalneg, neut = neut, pos = pos, verypos = verypos, eh = eh)

I realized I wanted my data to be structured with these values in just one column, so instead of redoing I just pivoted to long format:

df2 <- df2 %>%
  pivot_longer(
    cols = c("totalneg", "neut", "pos", "verypos", "eh"),
    names_to = "opinions")
df2

# A tibble: 5 × 2
  opinions value
  <chr>    <dbl>
1 totalneg   745
2 neut       115
3 pos         74
4 verypos     64
5 eh          87

# percentage column - just because
df2 <- df2 %>%
  mutate(percentage = (value/sum(value)))

Plotting it just for fun

p2 <- df2 |>
  ggplot(aes(x = reorder(opinions, value), y = value)) +
  geom_bar(stat = "identity") +
  labs(x = "opinions", y = "share of population") +
  theme_bw()
p2

Involved example: generate specific, biased number sets

specify a range and assign it to a variable
use dnorm() with the parameters mean and sd (standard deviation) to generate the probabilities of each point in this range

# temperature column
temperature <- runif(n=365, min=-5, max=102)
# for this one I just don't care. give me some crazy values.

# windspeed column

# weighted (biased) random number generation
values <- 0:70 # this is the range of the wind speed
# this parts obviously normally done horizontally. :
weights <- dnorm( # normal Distribution
  values, # the 0:70 we just specified
  mean = 17, # the center of the *bell curve* will be 17
  sd = 7 
)

Explanation: the sd = 7 is the standard deviation. about ~68% of values fall within 1 standard deviation of the mean
- every standard deviation covers more and more of your range (more distance from the mean)
- because 17 is the mean, it takes up to 3 standard deviations to get to 0
- 7 standard deviations only just gets to 70
- 7 standard deviations covers everything but only the edges get the 40:70 range.
- the lesser standard deviations covering everything else (the 1:30 range) makes them occur much more

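assign a weight - a likelihood - to each value 0:70 according to where it falls on this bell curve. aka value 17 may be about 5% likely while 50 may be like .01%. if you open weights, it's all small decimals (relative likelihoods; they get normalized to add up to 1 further down)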

# Reshape the bell curve by scaling the weights of selected value ranges.
# Each range is stored as one logical mask and used on BOTH sides of `<-`.
# Previously the two sides selected ranges of different lengths (23:39 vs
# 25:39, and 0:3 vs 0:5), so R misaligned the replacement values and warned.
is_high <- values > 40
weights[is_high] <- weights[is_high] * 0.2 # 20% of their original weight (an 80% decrease)

is_mid <- values >= 23 & values <= 39
weights[is_mid] <- weights[is_mid] * 0.6 # values 23:39 keep 60% of their weight

is_calm <- values >= 0 & values <= 3
weights[is_calm] <- weights[is_calm] * 0.4 # values 0:3 keep 40% of their weight (60% less likely)

# "normalize" weights so they all add up to 1 (100%)
weights <- weights / sum(weights)

# actually assign windspeed: draw 365 values, each picked with probability
# given by its weight
windspeed <- sample(values, size = 365,
                    replace = TRUE, # the same value may be drawn more than once
                    prob = weights)

# rainfall

values <- seq(0, 5, by=0.1) # this allows for decimals.!!!
weights <- dnorm(values, mean = 0.5, sd = 1) # small standard deviation to make most of it be small
weights[values >=3 & values <= 5] <- weights[values >=3 & values <= 5] * 0.5
# make really heavy rainfall 50% less likely
weights <- weights / sum(weights)

# actually add rainfall
rainfall <- sample(values, size = 365, replace = TRUE, prob = weights)

Add together into a dataframe

oneyear <- tibble( # you use tibble instead of data.frame or data_frame
  temperature = temperature, windspeed = windspeed, rainfall = rainfall)

Add a year of dates

# Add a `date` column holding consecutive days, one per row.
oneyear <- oneyear |>
  mutate(date = seq.Date( # creates full year of dates
    from = as.Date("2023-01-01"), # the first date in the sequence
    by = "day", length.out = n())) # step one day at a time; n() is the number of rows, so the sequence has exactly one date per row

# alternative format
w <- seq.Date(from = as.Date("2023-01-01"), 
              to = as.Date("2023-12-31"), by = "day")

write_csv(oneyear, "oneyear.csv")

oneyear |>
  ggplot(aes(x = date, y = rainfall)) +
  geom_area(stat = "identity", fill = "#345573") +
  geom_bar(stat = "identity", fill = "#010A86", alpha = (rainfall)/3) +
  labs(x = "Date",
       y = "Rainfall (inches)",
       title = "Rainfall in Washington during year 2025") +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5))

oneyear |>
  ggplot(aes(x = date, y = temperature, color = temperature)) +
  geom_line(stat = "identity") +
  labs(x = "Date",
       y = "Temperature (°F)",
       title = "Temperature in Washington during year 2025") +
  theme_minimal() +
  scale_color_gradient(low = "#E292CC", high = "#DC0000", aesthetics = "color") +
  guides(color = "none") +
  theme(plot.title = element_text(hjust = 0.5))

# Windspeed over the year: area, jittered points, exact points and a line.
oneyear |>
  ggplot(aes(x = date, y = windspeed)) +
  # A fixed colour goes OUTSIDE aes(). Inside aes() the hex string is treated
  # as a one-level category: it gets a default colour and its own legend entry.
  geom_area(aes(alpha = (rainfall)/100), color = "#A08DC9") +
  geom_jitter(alpha = 0.5, color = "#321346") +
  geom_jitter(alpha = 0.2, color = "#040128") +
  geom_point(alpha = 0.7, color = "#070052") +
  # outside aes(), `rainfall` is the standalone vector, not the column
  geom_line(alpha = (rainfall)/9) +
  labs(x = "Date",
       y = "Windspeed (mph)",
       title = "Windspeed in Oregon during year 2025") +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5)) # centre the title

#+  scale_alpha(guide = "none") # idk man

I feel like you just figure this out as you go along

do not use na.omit or drop_na on an entire dataset (it drops every row that has an NA in any column)

Use `!is.na`

# example from squirrel project: keep rows where BOTH columns are non-missing.
# The second step filters the result of the first; previously it started again
# from squircensus, which threw the first filter away.
subsetsquirrel <- squircensus[!is.na(squircensus$highlightfurcolor), ]
subsetsquirrel <- subsetsquirrel[!is.na(subsetsquirrel$otheractivities), ]
# the comma is necessary (rows go before it, columns after), and so is assigning the variable

squircensus |>
  filter(!is.na(highlightfurcolor) & !is.na(otheractivities))

squircensus |> 
  filter(!is.na(highlightfurcolor) & !is.na(otheractivities))

ggplot basics

Defining the variables:
- ggplot(aes(x = column1, y = column2))
if you want to use a mean for either of these, you must assign the mean to a specific separate variable and call on that (eg: themean <- mean(data$col), aes(x=themean)), or else it is most likely to fail or kill your render attempt and make you cry.

Color in ggplot2

nondiscriminatory outlines: aes() with just x & y, then geom_bar(stat = "identity", color = "#9871A8")
- color refers to the outline, and fill to the inside; to fill the bars in a color, type fill = "#5612D6". can be used with or without color also specified
discriminatory color (different by variable): ggplot(aes(x = col1, y = col2, color = col3)) OR fill = col3. fill or color depends on the geom type.
- next, enter scale_fill_ or scale_color_ on its own line to enter which color
- scale_fill_brewer() uses RColorBrewer palettes; scale_fill_viridis_d() / scale_fill_viridis_c() (built into ggplot2) or scale_fill_viridis() (viridis library) for colorblind friendly palettes
- scale_fill_manual() or scale_color_manual assigns specific colors to specific variables in the plot.
- another option for using scale_fill_manual() is to assign your colors to a variable as a vector, and call on them like: scale_fill_manual(values = acolors)
- scale_fill_gradient() creates a two-color gradient; scale_fill_gradient2() for a diverging gradient (low, mid, high); scale_fill_gradientn() for a gradient through any number of colors

# manual example
reptemps |>
  ggplot(aes(x = location, y = temperature, fill=location, alpha = temperature)) +
  geom_bar(stat = "identity") +
  scale_fill_manual(values = c("Ground Plane" = "#790033",
                              "Above Ground" = "#ee3a83"))

# if you don't specify the color for NA it will be grey

# gradient example
squirrelplot <- reptemps |>
  ggplot(aes(x = numberofsquirrels, y = temperature, color = temperature)) +
  geom_point(alpha = 0.3)+
  scale_color_gradient(low = "#790033", high = "#ee3a83", aesthetics = "color")
squirrelplot

# a painfully familiar graph.

alpha is transparency
- assign it to a variable inside of the aes() to have it be variable-dependent
- assign it to a number inside of the geom_type() to have it apply the same to all
- like in this graph, it can be used for all points to show concentration when points overlap

Cuter ggplot2 aesthetics

use labs to specify the labels for each thingy
add \n inside a label to add a line break
theme

squirrelplot2 <- squirrelplot +
  labs(x = "number of squirrels",
       y = "temperature (°Fahrenheit)",
       title = "squirrels seen by temperature recorded",
       caption = "thesquirrelcensus",
       color = "TEMPERATURE °F",
       alpha = "whatever man") + # name the keys based on what they were. color, fill, size, alpha, etc
    geom_text(aes( # labels
      alpha = temperature, # text transparency matches dots
      label = round(temperature, 2)), # round is unnecessary in this case
      hjust = -.5, # or vjust
      color = "#790033", 
      size = 2.5) +
    geom_text(aes(x=7.7, y=90, # coordinates
                  label="squirrls"), cex=14, # size(?)
              color="black", angle = 0.8)  +
  geom_text(aes(x=16, y=40, # coordinates
                  label="nunber of thesquirrelcensus"), cex=8, # size(?)
              color="black", angle = -0.8)  +
  theme_gray(base_size = 12, base_family = "serif") # many different choices, library "ggthemes" adds more
squirrelplot2

To adjust the text alignment of title, labels, etc:

Specific color to variable with `scale_whatever_manual`

# subset: narrow down to one year & crime type
crime2016 <- crimes |>
  filter(year == "2016", crimetype == "Crimes Against Persons")

ggplot(crime2016) +
    geom_bar(aes(y = county, x = totalincidents, 
                # instead of a color/fill, name the column/variable
                 fill = "Total incidents of all crime"), 
             color = "#FFCB50", stat = "identity", alpha = 0.1) +
  geom_bar(aes(y = county, x = antigaymale, 
               fill = "Anti gay male crime"), 
           color = "#3D3387", stat = "identity", alpha = 0.5) +
    geom_bar(aes(y = county, x = antijewish, 
                 fill = "Anti Jewish crime"), 
             color = "#ef4c2f", stat = "identity", alpha = 0.5) +
  # now use scale_fill_manual (or scale_color_manual) to define the above "colors"
  scale_fill_manual(values = c(
    "Total incidents of all crime" = "#FFCB50",
    "Anti Jewish crime" = "#ef4c2f",
    "Anti gay male crime" = "#3D3387"
  )) +
  theme_minimal() +
  labs(
    title = "Most common hate crime committed in 2016, by city in NY",
    x = "Incidents of hate crime",
    y = "", # looks nicer without the "city" label, and I think y-axis is obvious
    fill = "Crime"
  )

# example 2: name the legend and give each fill value its own colour.
# Fixed two syntax errors: the extra pair of parentheses around the arguments
# and the missing closing quote after "Quadratic".
# NOTE(review): `breaks` lists "race" and "sentencelength", which have no entry
# in `values`; breaks normally name the same categories as values. Confirm
# against the Incarceration project.
  scale_fill_manual(name = "Regression Model",
                    breaks = c("race", "sentencelength", "Cubic"),
                    values = c(
                      "Cubic" = "pink",
                      "Quadratic" = "blue",
                      "Linear" = "purple"))

I have no idea what that one means but its in my Incarceration project I guess for some reason(??) I am including it in case it becomes useful

Histogram and bar graphs in depth

# I don't know what I'm doing
reptemps |>
  ggplot(aes(x = numberofsquirrels, color = temperature)) +
  geom_histogram(color = "blue")+
  scale_color_gradient(low = "#790033", high = "#ee3a83", aesthetics = "color")

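geom_bar(position = "fill") does some cool stuff! (it stacks the bars and stretches every bar to full height, so you see proportions instead of counts)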

# whatever

Lollipop chart in ggplot2

A lollipop chart requires a categorical and numeric value, like a barchart. It is used to compare values to an overall value, like comparing the amount of crime in different cities to the average (or median) crime for all cities in the country.

It can be ideal to subset your data first to make it easier to know what you’re doing. You also need to assign the midline/baseline to a separate variable because you can’t call on a column inside aes()

The lollipop chart is a combination of three ggplot styles: geom_point for the dots, geom_segment for the sticks, and geom_hline or geom_vline for the midline (or whatever its called).

# define midline, the baseline mean/average.
# (na.rm is not set, so a single NA in totaloffenders makes this NA)
offenderavg <- mean(crime2016$totaloffenders)

crime2016 |>
  ggplot(aes(x = county, y = totaloffenders, color = totaloffenders)) +
  # sticks: x is inherited from ggplot(); each runs from the baseline to its dot
  geom_segment(aes(xend = county,
                   y = offenderavg,
                   yend = totaloffenders)) +
  geom_point(size = 4.5,
             alpha = 0.8) +
  scale_color_gradient(low = "#790033", high = "#ee3a83", aesthetics = "color") +
  # main horizontal line; line thickness is `linewidth` (ggplot2 >= 3.4.0),
  # `size` for lines is deprecated
  geom_hline(yintercept = offenderavg, color = "#3d3b3c", linewidth = .5) +
  labs(x = "County",
       y = "Total offenders",
       title = "2016 hate crime offenders by county compared to NY average",
       caption = "Source: NY State Division of Criminal Justice Services",
       color = "total offenders") +
  theme_minimal(base_size = 12, base_family = "sans") +
  # annotate() draws each label once; geom_text(aes(<constants>)) would draw
  # one copy per data row, stacked on top of each other
  annotate("text", x = 9, y = 86, label = "Kings", color = "#56122e", size = 3.5) +
  annotate("text", x = 14, y = 87, label = "New York", color = "#56122e", size = 3.5) +
  theme(axis.text.x = element_blank()) # remove x-axis labels for cleanness

Simple adjusted lollipop:

# Lollipop chart with counties sorted by their number of offenders.
crime2016 |>
  ggplot(aes(x = reorder(county, totaloffenders), # reorder/rearrange by value; list county sorted by totaloffenders
             y = totaloffenders, color = totaloffenders)) +
  # The sticks must use the same reordered x as the dots. Previously this
  # layer mapped x back to plain `county`, and because it is the first layer
  # its alphabetical order decided the axis, so the sorting never showed up.
  geom_segment(aes(xend = reorder(county, totaloffenders),
                   y = offenderavg,
                   yend = totaloffenders)) +
  geom_point(size = 4,
             alpha = 0.8) +
  scale_color_gradient(low = "#790033", high = "#ee3a83", aesthetics = "color") +
  # line thickness is `linewidth` (ggplot2 >= 3.4.0); `size` for lines is deprecated
  geom_hline(yintercept = offenderavg, color = "#3d3b3c", linewidth = .5) +
  labs(x = "County",
       y = "Total offenders",
       title = "2016 hate crime offenders by county compared to NY average",
       caption = "Source: NY State Division of Criminal Justice Services",
       color = "total offenders") +
  theme_minimal(base_size = 12, base_family = "sans") +
  theme(axis.text.x = element_blank()) # hide the county names on the x-axis

To flip the points to sort the other way, add a negative right before the value you’re sorting by: aes(x = reorder(county, -totaloffenders))

sometimes the lollipop chart displays 2 dots for one value. this happens if there are duplicates. to remove them, filter so there’s no duplicates

Further customization

to make a vertical line instead, type geom_vline and specify xintercept = variable.
I have not yet been able to figure out how to arrange the points in order. it fails and fails and fails.
I have also not yet figured out how to change the colors of the lines and points separately, but you can probably do that by specifying that in aes()

Text labels:
- you can add , "%" or similar within paste in a geom_text(aes) to have it print that.
- use round() to round to the nearest value. round(col, 1) rounds to 1 decimal place, round(col, 2) rounds to 2 decimal places, round(col, 0) rounds to a whole number.
- if you have a percentage in tiny decimal format, type column * 100 to bring it to percentage format.

# Add percentage labels above the bars, plus hand-placed brackets and captions.
# The brackets and captions use annotate() rather than geom_segment()/geom_text()
# with constants inside aes(): those geoms draw one copy per data row, stacked
# on top of each other. Each annotate("segment") call draws one bracket by
# passing its segments as vectors (segment i runs from x[i], y[i] to
# xend[i], yend[i]). Line thickness is `linewidth` (ggplot2 >= 3.4.0).
p2 +
  geom_text(aes(label =
                  paste0( # paste0 glues the pieces together with no space
                    round(percentage * 100, 1), # decimal share -> percentage, rounded to 1 decimal place
                    "%")), # paste % after
              size = 3, vjust = -0.4) +
  # first bracket: bars 1 and 2
  annotate("segment",
           x = c(1, 2, 1), xend = c(1, 2, 2),
           y = c(130, 150, 230), yend = c(230, 230, 230),
           linewidth = 0.4, linetype = "longdash") +
  # text label
  annotate("text", x = 1.5, y = 260, label = "12.7% positive", size = 3) +
  # second set: bars 3 and 4
  annotate("segment",
           x = c(3, 4, 3), xend = c(3, 4, 4),
           y = c(150, 180, 260), yend = c(260, 260, 260),
           linewidth = 0.4, linetype = "longdash") +
  # text label
  annotate("text", x = 3.5, y = 290, label = "18.6% ambivalent", size = 3) +
  # third bracket: joins the two captions above
  annotate("segment",
           x = c(1.5, 3.5, 1.5), xend = c(1.5, 3.5, 3.5),
           y = c(300, 330, 450), yend = c(450, 450, 450),
           linewidth = 0.4, linetype = "longdash") +
  annotate("text", x = 2.5, y = 480, label = "31.3% non-negative", size = 3) +
  # fourth bracket; its second segment is a line underneath the other lines for cuteness
  annotate("segment",
           x = c(3, 3, 4.4), xend = c(3, 3, 3),
           y = c(510, 330, 610), yend = c(610, 420, 610),
           linewidth = 0.4, linetype = "longdash") +
  annotate("text", x = 3.75, y = 640, label = "87.3% non-positive", size = 3)

**paste() vs paste0()?
- paste() adds a space by default.
- paste0() means no space.

Line chart/trendlines (overlapped plots)

ggplot legends and other customization

Remove legend

squirrelplot2 +
  scale_color_gradient(low = "#83D0CB", high = "#115278", aesthetics = "color") +
  theme(legend.position = c(.9, .4)) +
  # remove the alpha legend
  scale_alpha(guide = "none")

Make TRANSPARENT

transparentplot <- squirrelplot2 +
  theme(
    panel.background = element_rect(fill = "transparent"),
    plot.background = element_rect(fill = "transparent", color = NA)
  )

ggsave("squirrelplot2.png", squirrelplot2, bg='transparent')

ggsave('transparent2.png', transparentplot, bg='transparent')

squirrelplot2 +
  theme(
    panel.background = element_rect(fill = "transparent", color = NA),
    plot.background = element_rect(fill = "transparent", color = NA),
    legend.background = element_rect(fill = "transparent", color = NA),
    legend.box.background = element_rect(fill = "transparent", color = NA),
    panel.grid = element_blank()
  )

  theme(
    panel.background = element_rect(fill='transparent'), #transparent panel bg
    plot.background = element_rect(fill='transparent', color=NA), #transparent plot bg
    panel.grid.major = element_blank(), #remove major gridlines
    panel.grid.minor = element_blank(), #remove minor gridlines
    legend.background = element_rect(fill='transparent'), #transparent legend bg
    legend.box.background = element_rect(fill='transparent') #transparent legend panel
  )

Stacking geom_plot types

If you specify the aes(x, y) of a plot inside of geom_ instead of ggplot you can stack plots.

# ridiculous fake yearly weather data
tempmean <- mean(temperature)
rainmean <- mean(rainfall)
windmean <- mean(windspeed)
negrain <- -rainfall
negtemp <- -temperature
negwind <- -windspeed
tripwind <- windspeed*3
doubrain <- rainfall*2
doubtemp <- temperature*2

# Stacked layers: every geom brings its own aes(x, y), so nothing is set in ggplot().
ggplot(oneyear) +
 # geom_area(aes(x = windspeed, y = rainfall), fill = "#f9dbbd") +
#  geom_area(aes(x = tripwind, y = rainfall)) +
    geom_line(aes(x = temperature, y = rainfall), color = "#a53860") +  # lmfao
  # one segment from every (temperature, rainfall) point to the overall mean point
  geom_segment(aes(x = temperature,
               y = rainfall,
               xend = tempmean,
               yend = rainmean), color = "#450920", alpha = rainfall) +
  geom_line(aes(x = rainfall, y = rainfall), color = "#ffa5ab") +
  labs(title = "One year of rainfall in Oregon") +
  # A complete theme such as theme_void() replaces every theme setting made
  # before it, so it must come first and the theme() tweaks after. In the old
  # order the title styling was silently discarded.
  theme_void() +
  theme(
    plot.title = element_text(hjust = 0.5, face = "bold")) # hjust = 0.5 centres the title (was 5)

  # geom_segment(aes(x = windspeed,
  #                  y = rainfall,
  #                  xend = windmean,
  #                  yend = rainfall))

Pie charts (which you must never use)

Highcharter basics

begin each plot with highchart() or hchart()
hc_add_series()
don’t even know how color works
figure it out yourself
chatgpt is not going to help you with this
if you don’t understand anything about javascript it’s best to give up on highcharter and find another library (a note to self)

Density plot

as mentioned in ggplot2 section, a density plot shows the distribution and frequency

taken straight from data final:

# subset
tempsquirrel <- squircensus |>
  select(temperature, numberofsquirrels, primaryfurcolor) |>
  arrange(primaryfurcolor)

# subset further
greysquirrel <- tempsquirrel |> filter(primaryfurcolor == "Gray")
blacksquirrel <- tempsquirrel |> filter(primaryfurcolor == "Black")
cinnamonsquirrel <- tempsquirrel |> filter(primaryfurcolor == "Cinnamon")

hchart(
  # first density plot (temperature)
  density(tempsquirrel$temperature), type = "area",
  color = "#ad755a",
  name = "Temperature distribution") |>
  # second density plot (black squirrels)
    hc_add_series(
    density(blacksquirrel$temperature), type = "area",
    color = "#000",
    name = "Black Squirrel") |>
  # third density plot (grey squirrels)
    hc_add_series(
    density(greysquirrel$temperature), type = "area",
    color = "#c5bdc9", 
    name = "Grey Squirrel") |>
  # fourth density plot (cinnamon squirrels)
      hc_add_series(
  density(cinnamonsquirrel$temperature), 
  type = "area", name = "Cinnamon Squirrel", color = "#7f3300") |>
  # text features
  hc_title(text="Squirrel Activity in Different Temperatures by Color",
           margin = 30,
    align = "center",
    style = list(color = "#4c2918") ) |>
  hc_subtitle(text="Source: The Squirrel Census",
              style = list(color = "#594135")) |>
  hc_xAxis(title = list(text="Temperature (°F)",
                        margin = 5,
                        style = list(color = "#594135")))

Lollipop

I am unclear on how even to include the baseline

crimeconcentration <- crimes |>
  filter(year == "2015", crimetype == "Property Crimes") |>
  select(county, year, offenderconcentration, 
         "antigaymale", "antijewish",
         totalincidents, totalvictims, totaloffenders, crimetype) |>
  filter(antijewish > 0)

# Lollipop chart of anti-Jewish hate crimes per county (2015 property crimes).
highchart() |>
  hc_add_series(data = crimeconcentration,
                type = "lollipop",
                # hcaes() takes bare column names. The quoted "antijewish" was
                # read as a constant string, not the column. `group = offenderavg`
                # is removed: it was a single number computed for another plot,
                # not a column of this data.
                hcaes(x = county,
                      y = antijewish)) |>
 # hc_colors(acolors) |>
  hc_xAxis(title = list(text="County")) |>
  hc_yAxis(title = list(text="Number of anti-jewish hatecrimes"),
           # baseline: a horizontal plot line at the mean of the plotted column
           plotLines = list(list(
             value = mean(crimeconcentration$antijewish),
             color = "#3c3d3c",
             width = 1
           ))) |>
  hc_subtitle(text="NY State Division of Criminal Justice Services") |>
  hc_title( # from rdrr.io
    text = "<b>NY counties by anti Jewish hate crimes in 2015</b>",
    margin = 30,
    align = "center",
    style = list(color = "#3c3d3c")
  )

Bar

leafs

basics….

theres like multiple types of functions
in R, you need brackets