2026 R Notes

# general
library(tidyverse)

# plotting
library(ggplot2)

# date, time
library(lubridate)
library(zoo)

# statistical modeling
library(party)

# read the three CSV files (paths are relative to the working directory)
crimes <- read_csv(file = "hateCrimes2010.csv")
squirinfo <- read_csv(file = "centralparksquirrelcensus.csv")
squirhectare <- read_csv(file = "centralpartsquirrel_hectare.csv")

# crimes: lowercase the column names, then strip spaces and hyphens
# (troublesome symbols) so columns can be typed without backticks
names(crimes) <- names(crimes) |>
  tolower() |>
  gsub(pattern = " ", replacement = "") |>
  gsub(pattern = "-", replacement = "")

# squirrel tables: lowercase the column names and strip spaces
names(squirinfo) <- names(squirinfo) |>
  tolower() |>
  gsub(pattern = " ", replacement = "")
names(squirhectare) <- names(squirhectare) |>
  tolower() |>
  gsub(pattern = " ", replacement = "")

# No `by` is given, so left_join() matches on every column name the two
# tables share and keeps all rows of squirhectare.
# (note from the original run: it joined by hectare)
squircensus <- squirhectare |> left_join(squirinfo)

Removing rows/columns

# remove a ton of different ranges of rows
# (negative indices drop rows; the trailing comma means "all columns")
brokensquirrel <- squircensus[-c(3:66, 68:87, 90:757, 760:2536, 2540:3102), ]
# remove just one row
# (the comma matters: without it the index is applied to columns, not rows)
# brokensquirrel <- brokensquirrel[-5, ]

Removing columns

# drop the listed columns; everything else is kept
brokensquirrel <- brokensquirrel |>
  select(
    !c(sighterobservedweatherdata, litternotes, otheranimalsightings,
       hectareconditions)
  )
# keep ONLY the listed columns, in the order given
brokensquirrel <- brokensquirrel |>
  select(c(litter, totaltimeofsighting, otheractivities, otherinteractions))
brokensquirrel

# A tibble: 10 × 4
   litter   totaltimeofsighting otheractivities         otherinteractions
   <chr>                  <dbl> <chr>                   <chr>            
 1 Some                      22 <NA>                    <NA>             
 2 Some                      22 <NA>                    <NA>             
 3 None                      20 <NA>                    <NA>             
 4 Abundant                  25 <NA>                    couldn't get near
 5 Abundant                  25 <NA>                    <NA>             
 6 None                      25 <NA>                    <NA>             
 7 Some                      30 <NA>                    <NA>             
 8 Some                      25 <NA>                    <NA>             
 9 Some                      25 jumping                 stared at me     
10 Some                      25 laid down on the branch <NA>

# only 4 columns - and only 10 rows, because of previous chunk

Removing all numbers, text, whatever

Remove everything except numbers from a weather column

# turning the "sighterobservedweatherdata" column into just "temperature"
# Take only the FIRST number found in the text. Deleting every non-digit
# (gsub("\\D", "", ...)) glued ranges together, e.g. "~72-73" became 7273,
# and those rows were then thrown away by the filter below instead of fixed.
# NOTE(review): assumes the first number in the text is the temperature —
# confirm against the raw weather notes.
squircensus <- squircensus |>
  mutate(
    temperature = as.numeric(
      str_extract(sighterobservedweatherdata, "\\d+(\\.\\d+)?")
    )
  )
# sanity bound on the parsed values; filter() also drops rows where
# temperature is NA (weather text with no number in it)
squircensus <- squircensus |> filter(temperature < 99)

Just remove text

# overwrite the weather column with only its digits, converted to a number
# inside [ ], a leading ^ means "anything that is NOT one of these", so
# "[^0-9]" matches every non-digit character
textlesssquirrel <- squircensus |>
  mutate(
    sighterobservedweatherdata = sighterobservedweatherdata |>
      gsub(pattern = "[^0-9]", replacement = "") |>
      as.numeric()
  )

Making new columns (`mutate`)

Make a comparison column with `mutate`

# new column = old column 1 - old column 2, computed row by row
crimes <- mutate(crimes, antiwomenvmen = antifemale - antimale)

Subsetting

Making a summary column using `group_by` and `summarise`

# plain arithmetic: add the seven values, then divide the total by 700
# NOTE(review): this chunk does not use group_by()/summarise() despite the
# heading above it — confirm which example was meant to go here
(23.4+118+125+96+88+76+120)/700

[1] 0.9234286

Filtering

Filter for repetition (multiple instances only) - using `n()`

Select only rows with duplicates (repeats in the column specified in group_by)

# keep only rows whose temperature value occurs more than once:
# group_by() makes n() count the rows within each temperature group
reptemps <- squircensus |>
  group_by(temperature) |>
  filter(n() > 1) |>
  ungroup() # drop the grouping so later steps work on the whole table again

Filter pieces of text - using `grep`

ex: select only squirrels with “mushroom” in their other notes

# subset() looks column names up inside the data frame, so no `$` is needed
fungallyinclinedsquirrels <- subset(
  squircensus,
  grepl("mushroom", otheractivities)
)
fungallyinclinedsquirrels

# base R: logical row index, empty column index (= keep all columns)
fungallyinclinedsquirrels <- squircensus[
  grepl("mushroom", squircensus[["otheractivities"]]),
]

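grepl: returns TRUE or FALSE for each element, depending on whether the pattern is found in it, which is why it can be used to pick rows.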

Filter for absolute value

# keep rows where the difference is more than 0.5 in either direction
# general pattern: filter(abs(value) > #)
filter(crimes, abs(antiwomenvmen) > 0.5)

Printing, counting, sums

Unique

List all unique

# distinct values of the column (NA shows up as one of them)
unique(squircensus[["totaltimeofsighting"]])

 [1] 22 26 23 25 35 20 24 28 27 30 15 31 46 39 NA 65 29 36 45 42 50 44 34 21 64
[26] 18 55 40 37 33 19 43 10 17  5  1 67 32 58

List all unique as a table

# every count is 1 because unique() has already removed the repeats, so
# this only lists the distinct values in sorted order
squircensus$totaltimeofsighting |>
  unique() |>
  table()


 1  5 10 15 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 39 
 1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1 
40 42 43 44 45 46 50 55 58 64 65 67 
 1  1  1  1  1  1  1  1  1  1  1  1

Total number of unique

# how many distinct values the column has
squircensus$totaltimeofsighting |>
  unique() |>
  length()

[1] 39

Much better version of the table

# one row per distinct value with how often it occurs, most frequent first
# (count() does the group_by() + summarize(n()) + arrange(desc()) in one go)
uniquetimes <- squircensus |>
  count(totaltimeofsighting, name = "count", sort = TRUE)
uniquetimes

# A tibble: 39 × 2
   totaltimeofsighting count
                 <dbl> <int>
 1                  20   677
 2                  25   523
 3                  30   261
 4                  22   198
 5                  23   183
 6                  21   105
 7                  26    94
 8                  27    84
 9                  35    82
10                  24    78
# ℹ 29 more rows

Base R's `table()` can supposedly also do this; given two columns it cross-tabulates them: table(dat$Marital_status, dat$approval_status) (source: https://www.pluralsight.com/resources/blog/guides/testing-for-relationships-between-categorical-variables-using-the-chi-square-test)

Count number of distinct/non repeated/whatnot:

# authorcount = n_distinct(author)

From https://stackoverflow.com/questions/19379081/how-to-replace-na-values-in-a-table-for-selected-columns: replace NA with something else under a specific condition (e.g. only in certain columns, or only in numeric columns):

by column type:

# replace NA with 0 in every numeric column
# (across() + where() is the current replacement for the superseded mutate_if())
x |> mutate(across(where(is.numeric), \(col) replace_na(col, 0)))

select columns defined in vars(col1, col2, …):

# replace NA with 0 in columns a, b and c only
# (across() is the current replacement for the superseded mutate_at())
x |> mutate(across(c(a, b, c), \(col) replace_na(col, 0)))

all columns:

# replace NA with 0 in every column
# (across(everything()) is the current replacement for the superseded mutate_all())
x |> mutate(across(everything(), \(col) replace_na(col, 0)))