Base Functions in R

17 + 18

## [1] 35

5 * 4

## [1] 20

100/15

## [1] 6.666667

4^3

## [1] 64

# Statistics (percentage below a z-score of -1)
pnorm(q = -1, mean = 0, sd = 1, lower.tail = TRUE)

## [1] 0.1586553

## Percentage below a z-score of -3, -2, -1, 0, 1, 2, and 3
pnorm(q = c(-3,-2,-1,0,1,2,3), mean = 0, sd = 1, lower.tail = TRUE)

## [1] 0.001349898 0.022750132 0.158655254 0.500000000 0.841344746 0.977249868
## [7] 0.998650102

# Using variables
a <- 3
b <- 4
c <- sqrt(a^2 + b^2); c

## [1] 5

a <- c(3, 6, 9, 12, 15)
b <- c(4, 8, 12, 16, 20)
perfect.hyps <- sqrt(a^2 + b^2)
perfect.hyps

## [1]  5 10 15 20 25

# Using variables in R's base functions to compare scores on different standardized tests

## ACT
pop.mu = 20.8
pop.sd = 5.8

my.score <- 30

pnorm(q = my.score, mean = pop.mu, sd = pop.sd)

## [1] 0.9436538

### Note that you do not need to include the parameter names in the function call
pnorm(my.score, pop.mu, pop.sd)

## [1] 0.9436538

## GRE
pop.mu <- 150
pop.sd <- 8.6

my.scores <- c(158, 160, 165)

pnorm(my.scores, pop.mu, pop.sd)

## [1] 0.8238747 0.8775428 0.9594367

Load Required Packages and Data

library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.2     ✔ tibble    3.2.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.0.4     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

RawData <- read.csv("RWorkshop.csv")

Data Cleaning

Remove Unnecessary Rows

# Removing rows 1 through 2 - Note the comma after the parentheses
CleanerData <- RawData[-c(1:2),]

Remove Unnecessary Columns

# Retaining only the columns that we need
CleanData <- CleanerData %>%
  select(FirstName, LastName, Region, Birthday.1_1, Birthday.2_1, Birthday.3_1, AgeYears, HeightInches)

Calculating Length of Names

# Calculating the number of characters (letters) in each name
Name.Data <- CleanData %>%
  mutate(FirstNameLength = nchar(FirstName),
         LastNameLength = nchar(LastName))

Simplifying `Region`

# Removing states from the region values
Region.Data <- Name.Data %>%
  mutate(Region = case_when(startsWith(Region, "Northeast") ~ "Northeast",
                            startsWith(Region, "Midwest") ~ "Midwest",
                            startsWith(Region, "Southeast") ~ "Southeast",
                            startsWith(Region, "Southwest") ~ "Southwest",
                            startsWith(Region, "West") ~ "West",
                            startsWith(Region, "International") ~ "International"))

Simplifying `Birthday`

# Combining the three birthday variables into a single one and converting to R date format; removing the original (now unnecessary) birthday variables
Bday.Data <- Region.Data %>%
  mutate(Birthday = paste0(Birthday.1_1, " ", Birthday.2_1, ", ", Birthday.3_1)) %>%
  mutate(Birthday = as.Date(Birthday, format = "%B %d, %Y")
         ) %>%
  select(-Birthday.1_1, -Birthday.2_1, -Birthday.3_1)

Calculating Age in Days and Height in CMs

# Subtracting current date from birthday to calculate age in days as a numeric variable; ensuring height is numeric and converting height from feet to inches for individuals who accidentally reported their height as feet (since it is less than 7); converting height in inches to height in CMs; removing individuals who have an NA for height in cms or age in days (our main DVs)
FinalData <- Bday.Data %>%
  mutate(AgeDays = as.numeric(difftime(Sys.Date(), Birthday, units = "days")),
         HeightInches = as.numeric(HeightInches),
         HeightInches = ifelse(HeightInches < 7, HeightInches*12, HeightInches),
         AgeYears = as.numeric(AgeYears),
         HeightCMs = HeightInches*2.54) %>%
  filter(!is.na(HeightCMs) | !is.na(AgeYears))

Data Anlysis and Visualization

Comparing Height in CM and Height in Inches

# Correlation
cor(FinalData$HeightCMs, FinalData$HeightInches, method = "pearson")

## [1] 1

# Linear Regression
model1 <- glm(HeightCMs ~ HeightInches, data = FinalData)
summary(model1)

## 
## Call:
## glm(formula = HeightCMs ~ HeightInches, data = FinalData)
## 
## Coefficients:
##               Estimate Std. Error  t value Pr(>|t|)    
## (Intercept)  7.626e-14  1.299e-13 5.87e-01    0.565    
## HeightInches 2.540e+00  1.985e-15 1.28e+15   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 8.077936e-28)
## 
##     Null deviance: 1.3229e+03  on 19  degrees of freedom
## Residual deviance: 1.4540e-26  on 18  degrees of freedom
## AIC: -1187
## 
## Number of Fisher Scoring iterations: 1

Visualizing this Relationship

# Density of predictor variable (IV)
FinalData %>%
  ggplot(aes(x = HeightInches)) +
  geom_density(fill = "#BDD7E7") + theme_minimal() +
  labs(title = "Height in Inches of Summer 2024 Interns",
         x = "Height in Inches")

# Linear relationship of IV and DV
FinalData %>%
  ggplot(aes(x = HeightInches, y = HeightCMs)) +
  geom_point() +
  geom_smooth(method = "lm")+
  labs(title = "The Perfect Linear Relationship between Height in CMs and Height in Inches",
       subtitle = "\U03B2 = 2.54",
       x = "Height in Inches",
       y = "Height in CMs") +
  theme_minimal()

Comparing Age in Days and Age in Years

# Correlation
cor(FinalData$AgeDays, FinalData$AgeYears, method = "pearson")

## [1] 0.9852028

# Linear Regression
model2 <- glm(AgeDays ~ AgeYears, data = FinalData)
summary(model2)

## 
## Call:
## glm(formula = AgeDays ~ AgeYears, data = FinalData)
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   635.61     319.61   1.989   0.0622 .  
## AgeYears      360.66      14.79  24.388 3.06e-15 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 11580.59)
## 
##     Null deviance: 7096066  on 19  degrees of freedom
## Residual deviance:  208451  on 18  degrees of freedom
## AIC: 247.79
## 
## Number of Fisher Scoring iterations: 2

Visualizing this Relationship

# Density of predictor variable (IV)
FinalData %>%
  ggplot(aes(x = AgeYears)) +
  geom_density(fill = "#BDD7E7") + theme_minimal() +
  labs(title = "Age in Years of Summer 2024 Interns",
         x = "Age in Years")

# Linear relationship of IV and DV
FinalData %>%
  ggplot(aes(x = AgeYears, y = AgeDays)) +
  geom_point() +
  geom_smooth(method = "lm")+
  labs(title = "Age in Years by Age in days",
       subtitle = "\U03B2 ≈ 365",
       x = "Age in Years",
       y = "Age in Days") +
  theme_minimal()

Comparing Length of First Names and Last Names

# Correlation
cor(FinalData$FirstNameLength, FinalData$LastNameLength, method = "pearson")

## [1] 0.06410693

# Linear Regression
model2 <- glm(LastNameLength ~ FirstNameLength, data = FinalData)
summary(model2)

## 
## Call:
## glm(formula = LastNameLength ~ FirstNameLength, data = FinalData)
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)   
## (Intercept)      5.19742    1.70702   3.045  0.00697 **
## FirstNameLength  0.08155    0.29920   0.273  0.78831   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 3.128755)
## 
##     Null deviance: 56.550  on 19  degrees of freedom
## Residual deviance: 56.318  on 18  degrees of freedom
## AIC: 83.463
## 
## Number of Fisher Scoring iterations: 2

Visualizing this Relationship

# Density of predictor variable (IV)
FinalData %>%
  ggplot(aes(x = FirstNameLength)) +
  geom_density(fill = "#BDD7E7") + theme_minimal() +
  labs(title = "Length of First Names of Summer 2024 Interns",
         x = "Length of First Names")

# Linear relationship of IV and DV; note that we use geom_jitter instead of geom_point to see multiple points that have the same (x,y) coordinates. For example, there are multiple individuals with 6 character first names and 4 character last names, and we want to be able to see them all and not just one point at (6,4)
FinalData %>%
  ggplot(aes(x = FirstNameLength, y = LastNameLength)) +
  geom_jitter(height = 0.1, width = 0.1) + 
  geom_smooth(method = "lm")+
  labs(title = "Last Name Length by First Name Length",
       subtitle = "\U03B2 ≈ 0",
       x = "First Name Length",
       y = "Last Name Length") +
  theme_minimal()

Height by Region

# Table
height.by.region <- FinalData %>%
  group_by(Region) %>%
  summarise(Mean.Height = mean(HeightCMs),
            N = n()) %>%
  arrange(desc(Mean.Height))

knitr::kable(height.by.region)

Region	Mean.Height	N
Midwest	171.8733	3
International	170.8150	3
Northeast	167.2771	7
Southeast	165.1000	1
West	161.2900	2
Southwest	158.7500	4

# Linear Model (ANOVA)
## Reordering the levels in order of descending mean
FinalData$Region <- factor(FinalData$Region, levels = height.by.region$Region)

model3 <- glm(HeightCMs ~ Region, data = FinalData)
summary(model3)

## 
## Call:
## glm(formula = HeightCMs ~ Region, data = FinalData)
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          171.873      4.585  37.482 1.92e-15 ***
## RegionInternational   -1.058      6.485  -0.163   0.8727    
## RegionNortheast       -4.596      5.481  -0.839   0.4158    
## RegionSoutheast       -6.773      9.171  -0.739   0.4724    
## RegionWest           -10.583      7.250  -1.460   0.1664    
## RegionSouthwest      -13.123      6.066  -2.163   0.0483 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 63.07914)
## 
##     Null deviance: 1322.85  on 19  degrees of freedom
## Residual deviance:  883.11  on 14  degrees of freedom
## AIC: 146.51
## 
## Number of Fisher Scoring iterations: 2

Visualizing this Relationship

FinalData %>%
  ggplot(aes(x = Region, y = HeightCMs, fill = Region)) +
  scale_fill_brewer(palette = "Spectral") +
  geom_boxplot() +
  labs(title = "Height in CMs by Region",
       x = "Region",
       y = "Height in Cms") +
  theme_minimal() +
  theme(legend.position = "none")

2024 R Workshop

Muji Chughtai

2025-07-07

Base Functions in R

Load Required Packages and Data

Data Cleaning

Remove Unnecessary Rows

Remove Unnecessary Columns

Calculating Length of Names

Simplifying `Region`

Simplifying `Birthday`

Calculating Age in Days and Height in CMs

Data Anlysis and Visualization

Comparing Height in CM and Height in Inches

Visualizing this Relationship

Comparing Age in Days and Age in Years

Visualizing this Relationship

Comparing Length of First Names and Last Names

Visualizing this Relationship

Height by Region

Visualizing this Relationship

2024 R Workshop

Muji Chughtai

2025-07-07

Base Functions in R

Load Required Packages and Data

Data Cleaning

Remove Unnecessary Rows

Remove Unnecessary Columns

Calculating Length of Names

Simplifying Region

Simplifying Birthday

Calculating Age in Days and Height in CMs

Data Anlysis and Visualization

Comparing Height in CM and Height in Inches

Visualizing this Relationship

Comparing Age in Days and Age in Years

Visualizing this Relationship

Comparing Length of First Names and Last Names

Visualizing this Relationship

Height by Region

Visualizing this Relationship

Simplifying `Region`

Simplifying `Birthday`