Base Functions in R

17 + 18
## [1] 35
5 * 4
## [1] 20
100/15
## [1] 6.666667
4^3
## [1] 64
# Statistics (percentage below a z-score of -1)
pnorm(q = -1, mean = 0, sd = 1, lower.tail = TRUE)
## [1] 0.1586553
## Percentage below a z-score of -3, -2, -1, 0, 1, 2, and 3
pnorm(q = c(-3,-2,-1,0,1,2,3), mean = 0, sd = 1, lower.tail = TRUE)
## [1] 0.001349898 0.022750132 0.158655254 0.500000000 0.841344746 0.977249868
## [7] 0.998650102
# Using variables
a <- 3
b <- 4
c <- sqrt(a^2 + b^2); c
## [1] 5
a <- c(3, 6, 9, 12, 15)
b <- c(4, 8, 12, 16, 20)
perfect.hyps <- sqrt(a^2 + b^2)
perfect.hyps
## [1]  5 10 15 20 25
# Using variables in R's base functions to compare scores on different standardized tests

## ACT
pop.mu = 20.8
pop.sd = 5.8

my.score <- 30

pnorm(q = my.score, mean = pop.mu, sd = pop.sd)
## [1] 0.9436538
### Note that you do not need to include the parameter names in the function call
pnorm(my.score, pop.mu, pop.sd)
## [1] 0.9436538
## GRE
pop.mu <- 150
pop.sd <- 8.6

my.scores <- c(158, 160, 165)

pnorm(my.scores, pop.mu, pop.sd)
## [1] 0.8238747 0.8775428 0.9594367

Load Required Packages and Data

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.2     ✔ tibble    3.2.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.0.4     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
RawData <- read.csv("RWorkshop.csv")

Data Cleaning

Remove Unnecessary Rows

# Removing rows 1 through 2 - Note the comma after the parentheses
CleanerData <- RawData[-c(1:2),]

Remove Unnecessary Columns

# Retaining only the columns that we need
CleanData <- CleanerData %>%
  select(FirstName, LastName, Region, Birthday.1_1, Birthday.2_1, Birthday.3_1, AgeYears, HeightInches)

Calculating Length of Names

# Calculating the number of characters (letters) in each name
Name.Data <- CleanData %>%
  mutate(FirstNameLength = nchar(FirstName),
         LastNameLength = nchar(LastName))

Simplifying Region

# Removing states from the region values
Region.Data <- Name.Data %>%
  mutate(Region = case_when(startsWith(Region, "Northeast") ~ "Northeast",
                            startsWith(Region, "Midwest") ~ "Midwest",
                            startsWith(Region, "Southeast") ~ "Southeast",
                            startsWith(Region, "Southwest") ~ "Southwest",
                            startsWith(Region, "West") ~ "West",
                            startsWith(Region, "International") ~ "International"))

Simplifying Birthday

# Combining the three birthday variables into a single one and converting to R date format; removing the original (now unnecessary) birthday variables
Bday.Data <- Region.Data %>%
  mutate(Birthday = paste0(Birthday.1_1, " ", Birthday.2_1, ", ", Birthday.3_1)) %>%
  mutate(Birthday = as.Date(Birthday, format = "%B %d, %Y")
         ) %>%
  select(-Birthday.1_1, -Birthday.2_1, -Birthday.3_1)

Calculating Age in Days and Height in CMs

# Subtracting current date from birthday to calculate age in days as a numeric variable; ensuring height is numeric and converting height from feet to inches for individuals who accidentally reported their height as feet (since it is less than 7); converting height in inches to height in CMs; removing individuals who have an NA for height in cms or age in days (our main DVs)
FinalData <- Bday.Data %>%
  mutate(AgeDays = as.numeric(difftime(Sys.Date(), Birthday, units = "days")),
         HeightInches = as.numeric(HeightInches),
         HeightInches = ifelse(HeightInches < 7, HeightInches*12, HeightInches),
         AgeYears = as.numeric(AgeYears),
         HeightCMs = HeightInches*2.54) %>%
  filter(!is.na(HeightCMs) | !is.na(AgeYears))

Data Anlysis and Visualization

Comparing Height in CM and Height in Inches

# Correlation
cor(FinalData$HeightCMs, FinalData$HeightInches, method = "pearson")
## [1] 1
# Linear Regression
model1 <- glm(HeightCMs ~ HeightInches, data = FinalData)
summary(model1)
## 
## Call:
## glm(formula = HeightCMs ~ HeightInches, data = FinalData)
## 
## Coefficients:
##               Estimate Std. Error  t value Pr(>|t|)    
## (Intercept)  7.626e-14  1.299e-13 5.87e-01    0.565    
## HeightInches 2.540e+00  1.985e-15 1.28e+15   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 8.077936e-28)
## 
##     Null deviance: 1.3229e+03  on 19  degrees of freedom
## Residual deviance: 1.4540e-26  on 18  degrees of freedom
## AIC: -1187
## 
## Number of Fisher Scoring iterations: 1

Visualizing this Relationship

# Density of predictor variable (IV)
FinalData %>%
  ggplot(aes(x = HeightInches)) +
  geom_density(fill = "#BDD7E7") + theme_minimal() +
  labs(title = "Height in Inches of Summer 2024 Interns",
         x = "Height in Inches")

# Linear relationship of IV and DV
FinalData %>%
  ggplot(aes(x = HeightInches, y = HeightCMs)) +
  geom_point() +
  geom_smooth(method = "lm")+
  labs(title = "The Perfect Linear Relationship between Height in CMs and Height in Inches",
       subtitle = "\U03B2 = 2.54",
       x = "Height in Inches",
       y = "Height in CMs") +
  theme_minimal()

Comparing Age in Days and Age in Years

# Correlation
cor(FinalData$AgeDays, FinalData$AgeYears, method = "pearson")
## [1] 0.9852028
# Linear Regression
model2 <- glm(AgeDays ~ AgeYears, data = FinalData)
summary(model2)
## 
## Call:
## glm(formula = AgeDays ~ AgeYears, data = FinalData)
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   635.61     319.61   1.989   0.0622 .  
## AgeYears      360.66      14.79  24.388 3.06e-15 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 11580.59)
## 
##     Null deviance: 7096066  on 19  degrees of freedom
## Residual deviance:  208451  on 18  degrees of freedom
## AIC: 247.79
## 
## Number of Fisher Scoring iterations: 2

Visualizing this Relationship

# Density of predictor variable (IV)
FinalData %>%
  ggplot(aes(x = AgeYears)) +
  geom_density(fill = "#BDD7E7") + theme_minimal() +
  labs(title = "Age in Years of Summer 2024 Interns",
         x = "Age in Years")

# Linear relationship of IV and DV
FinalData %>%
  ggplot(aes(x = AgeYears, y = AgeDays)) +
  geom_point() +
  geom_smooth(method = "lm")+
  labs(title = "Age in Years by Age in days",
       subtitle = "\U03B2 ≈ 365",
       x = "Age in Years",
       y = "Age in Days") +
  theme_minimal()

Comparing Length of First Names and Last Names

# Correlation
cor(FinalData$FirstNameLength, FinalData$LastNameLength, method = "pearson")
## [1] 0.06410693
# Linear Regression
model2 <- glm(LastNameLength ~ FirstNameLength, data = FinalData)
summary(model2)
## 
## Call:
## glm(formula = LastNameLength ~ FirstNameLength, data = FinalData)
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)   
## (Intercept)      5.19742    1.70702   3.045  0.00697 **
## FirstNameLength  0.08155    0.29920   0.273  0.78831   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 3.128755)
## 
##     Null deviance: 56.550  on 19  degrees of freedom
## Residual deviance: 56.318  on 18  degrees of freedom
## AIC: 83.463
## 
## Number of Fisher Scoring iterations: 2

Visualizing this Relationship

# Density of predictor variable (IV)
FinalData %>%
  ggplot(aes(x = FirstNameLength)) +
  geom_density(fill = "#BDD7E7") + theme_minimal() +
  labs(title = "Length of First Names of Summer 2024 Interns",
         x = "Length of First Names")

# Linear relationship of IV and DV; note that we use geom_jitter instead of geom_point to see multiple points that have the same (x,y) coordinates. For example, there are multiple individuals with 6 character first names and 4 character last names, and we want to be able to see them all and not just one point at (6,4)
FinalData %>%
  ggplot(aes(x = FirstNameLength, y = LastNameLength)) +
  geom_jitter(height = 0.1, width = 0.1) + 
  geom_smooth(method = "lm")+
  labs(title = "Last Name Length by First Name Length",
       subtitle = "\U03B2 ≈ 0",
       x = "First Name Length",
       y = "Last Name Length") +
  theme_minimal()

Height by Region

# Table
height.by.region <- FinalData %>%
  group_by(Region) %>%
  summarise(Mean.Height = mean(HeightCMs),
            N = n()) %>%
  arrange(desc(Mean.Height))

knitr::kable(height.by.region)
Region Mean.Height N
Midwest 171.8733 3
International 170.8150 3
Northeast 167.2771 7
Southeast 165.1000 1
West 161.2900 2
Southwest 158.7500 4
# Linear Model (ANOVA)
## Reordering the levels in order of descending mean
FinalData$Region <- factor(FinalData$Region, levels = height.by.region$Region)

model3 <- glm(HeightCMs ~ Region, data = FinalData)
summary(model3)
## 
## Call:
## glm(formula = HeightCMs ~ Region, data = FinalData)
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          171.873      4.585  37.482 1.92e-15 ***
## RegionInternational   -1.058      6.485  -0.163   0.8727    
## RegionNortheast       -4.596      5.481  -0.839   0.4158    
## RegionSoutheast       -6.773      9.171  -0.739   0.4724    
## RegionWest           -10.583      7.250  -1.460   0.1664    
## RegionSouthwest      -13.123      6.066  -2.163   0.0483 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 63.07914)
## 
##     Null deviance: 1322.85  on 19  degrees of freedom
## Residual deviance:  883.11  on 14  degrees of freedom
## AIC: 146.51
## 
## Number of Fisher Scoring iterations: 2

Visualizing this Relationship

FinalData %>%
  ggplot(aes(x = Region, y = HeightCMs, fill = Region)) +
  scale_fill_brewer(palette = "Spectral") +
  geom_boxplot() +
  labs(title = "Height in CMs by Region",
       x = "Region",
       y = "Height in Cms") +
  theme_minimal() +
  theme(legend.position = "none")