# general
library(tidyverse)
# plotting
library(ggplot2)
# date, time
library(lubridate)
library(zoo)
# statistical modeling
library(party)
# 2026 R Notes
# Load the three raw data sets from the working directory.
# NOTE(review): "centralpartsquirrel_hectare.csv" may be a typo for
# "centralpark..." - confirm against the file name on disk before changing it.
crimes <- read_csv("hateCrimes2010.csv")
squirinfo <- read_csv("centralparksquirrelcensus.csv")
squirhectare <- read_csv("centralpartsquirrel_hectare.csv")

# Normalise column names: lower-case, then strip troublesome symbols
# (spaces everywhere; hyphens too for the crimes table).
names(crimes) <- gsub("[ -]", "", tolower(names(crimes)))
names(squirinfo) <- gsub(" ", "", tolower(names(squirinfo)), fixed = TRUE)
names(squirhectare) <- gsub(" ", "", tolower(names(squirhectare)), fixed = TRUE)

# One row per squirrel sighting, with the hectare-level columns attached.
# No `by` is given, so left_join() matches on every column name the two tables
# share and prints which ones it used (the original note: it joined by hectare).
squircensus <- left_join(squirhectare, squirinfo)

# Removing rows/columns
# Drop many ranges of rows in one go by negating a vector of row positions.
brokensquirrel <- squircensus[
  -c(3:66, 68:87, 90:757, 760:2536, 2540:3102),
]
# Drop a smaller range of rows (disabled). The trailing comma matters: without
# it, `[` indexes columns instead of rows.
# brokensquirrel <- brokensquirrel[-(5:10), ]

# Removing columns
# A minus sign inside select() drops the named columns.
brokensquirrel <- select(
  brokensquirrel,
  -sighterobservedweatherdata,
  -litternotes,
  -otheranimalsightings,
  -hectareconditions
)
# Naming columns without a minus keeps ONLY those columns.
brokensquirrel <- select(
  brokensquirrel,
  litter, totaltimeofsighting, otheractivities, otherinteractions
)
brokensquirrel
# A tibble: 10 × 4
litter totaltimeofsighting otheractivities otherinteractions
<chr> <dbl> <chr> <chr>
1 Some 22 <NA> <NA>
2 Some 22 <NA> <NA>
3 None 20 <NA> <NA>
4 Abundant 25 <NA> couldn't get near
5 Abundant 25 <NA> <NA>
6 None 25 <NA> <NA>
7 Some 30 <NA> <NA>
8 Some 25 <NA> <NA>
9 Some 25 jumping stared at me
10 Some 25 laid down on the branch <NA>
# only 4 columns - and only 10 rows, because of the previous chunk
# Removing all numbers, text, whatever
Remove everything except numbers from a weather column
# Turn the free-text "sighterobservedweatherdata" column into a numeric
# "temperature" column.
# Only the FIRST run of digits is taken. Deleting every non-digit ("\\D")
# glues separate numbers together, so an entry such as "~72-73" turned into
# 7273 and the whole row was then thrown away by the range filter below
# instead of being fixed.
# NOTE(review): rows with such entries are now kept, so row counts downstream
# can differ from earlier runs.
squircensus <- squircensus |>
  mutate(
    temperature = as.numeric(str_extract(sighterobservedweatherdata, "[0-9]+"))
  )
# Sanity check on the parsed values. filter() also drops rows whose
# temperature is NA, i.e. weather notes that contain no digits at all.
squircensus <- squircensus |>
  filter(temperature < 99)

# Just remove text
# Delete every non-digit character from the weather notes and convert what is
# left to a number. The caret has to sit inside the brackets to negate the
# class: "[^0-9]" means "any character that is not a digit".
textlesssquirrel <- mutate(
  squircensus,
  sighterobservedweatherdata = as.numeric(
    gsub("[^0-9]", "", sighterobservedweatherdata)
  )
)
# Making new columns (mutate)
Make a comparison column with mutate
# new column = old column 1 - old column 2
crimes <- mutate(crimes, antiwomenvmen = antifemale - antimale)
# Subsetting
Making a summary column using group_by and summarise
# NOTE(review): plain arithmetic only - no group_by()/summarise() example has
# been written under this heading yet.
(23.4 + 118 + 125 + 96 + 88 + 76 + 120) / 700
#> [1] 0.9234286
Filtering
Filter for repetition (multiple instances only) - using n()
Select only rows with duplicates (repeats in the column specified in group_by)
# Keep only the rows whose temperature value occurs more than once.
# group_by() makes n() count the rows per temperature value; ungroup()
# afterwards so later verbs on reptemps are not silently applied per group.
reptemps <- squircensus |>
  group_by(temperature) |>
  filter(n() > 1) |>
  ungroup()
# Filter pieces of text - using grep
ex: select only squirrels with “mushroom” in their other notes
# subset(): the condition is evaluated inside the data frame, so the column
# can be referred to by its bare name.
fungallyinclinedsquirrels <- subset(
  squircensus,
  grepl("mushroom", otheractivities)
)
fungallyinclinedsquirrels
# base R: the same selection written as a logical row index
fungallyinclinedsquirrels <- squircensus[
  grepl("mushroom", squircensus[["otheractivities"]]),
]
# grepl:
Filter for absolute value
# Rows where the difference exceeds 0.5 in either direction.
# abs() goes around the column and the comparison stays outside it:
# filter(abs(value) > #)
filter(crimes, abs(antiwomenvmen) > 0.5)
# Printing, counting, sums
Unique
List all unique
unique(squircensus$totaltimeofsighting)
#> [1] 22 26 23 25 35 20 24 28 27 30 15 31 46 39 NA 65 29 36 45 42 50 44 34 21 64
#> [26] 18 55 40 37 33 19 43 10 17 5 1 67 32 58
# List all unique as a table
# (every count is 1 because unique() has already removed the repeats, and
# table() leaves NA out by default)
table(unique(squircensus$totaltimeofsighting))
#> 1 5 10 15 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 39
#> 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
#> 40 42 43 44 45 46 50 55 58 64 65 67
#> 1 1 1 1 1 1 1 1 1 1 1 1
# Total number of unique (NA counts as one of the values)
length(unique(squircensus$totaltimeofsighting))
#> [1] 39
Much better version of the table
# How often each sighting duration occurs, most frequent first.
# count(..., sort = TRUE) bundles the group_by() + summarize(n()) +
# arrange(desc()) steps into a single call.
uniquetimes <- squircensus |>
  count(totaltimeofsighting, name = "count", sort = TRUE)
uniquetimes
# A tibble: 39 × 2
totaltimeofsighting count
<dbl> <int>
1 20 677
2 25 523
3 30 261
4 22 198
5 23 183
6 21 105
7 26 94
8 27 84
9 35 82
10 24 78
# ℹ 29 more rows
Base R's table() can supposedly do this kind of counting too; given two columns it cross-tabulates them: table(dat$Marital_status, dat$approval_status) (source: https://www.pluralsight.com/resources/blog/guides/testing-for-relationships-between-categorical-variables-using-the-chi-square-test)
Count the number of distinct (non-repeated) values:
# authorcount = n_distinct(author)
# From https://stackoverflow.com/questions/19379081/how-to-replace-na-values-in-a-table-for-selected-columns: replace NA with something else based on a specific condition (e.g. only in certain columns, or only when the column is numeric):
# across() replaces the superseded mutate_if() / mutate_at() / mutate_all().
# by column type:
x %>% mutate(across(where(is.numeric), ~ replace_na(.x, 0)))
# only the columns named in c(col1, col2, ...):
x %>% mutate(across(c(a, b, c), ~ replace_na(.x, 0)))
# all columns:
x %>% mutate(across(everything(), ~ replace_na(.x, 0)))