Base Functions in R
17 + 18
## [1] 35
5 * 4
## [1] 20
100/15
## [1] 6.666667
4^3
## [1] 64
# Statistics (percentage below a z-score of -1)
pnorm(q = -1, mean = 0, sd = 1, lower.tail = TRUE)
## [1] 0.1586553
## Percentage below a z-score of -3, -2, -1, 0, 1, 2, and 3
pnorm(q = c(-3,-2,-1,0,1,2,3), mean = 0, sd = 1, lower.tail = TRUE)
## [1] 0.001349898 0.022750132 0.158655254 0.500000000 0.841344746 0.977249868
## [7] 0.998650102
# Using variables
a <- 3
b <- 4
c <- sqrt(a^2 + b^2); c
## [1] 5
a <- c(3, 6, 9, 12, 15)
b <- c(4, 8, 12, 16, 20)
perfect.hyps <- sqrt(a^2 + b^2)
perfect.hyps
## [1] 5 10 15 20 25
# Using variables in R's base functions to compare scores on different standardized tests
## ACT
pop.mu = 20.8
pop.sd = 5.8
my.score <- 30
pnorm(q = my.score, mean = pop.mu, sd = pop.sd)
## [1] 0.9436538
### Note that you do not need to include the parameter names in the function call
pnorm(my.score, pop.mu, pop.sd)
## [1] 0.9436538
## GRE
pop.mu <- 150
pop.sd <- 8.6
my.scores <- c(158, 160, 165)
pnorm(my.scores, pop.mu, pop.sd)
## [1] 0.8238747 0.8775428 0.9594367
Data Cleaning
Remove Unnecessary Rows
# Removing rows 1 through 2 - Note the comma after the parentheses
CleanerData <- RawData[-c(1:2),]
Remove Unnecessary Columns
# Retaining only the columns that we need
CleanData <- CleanerData %>%
select(FirstName, LastName, Region, Birthday.1_1, Birthday.2_1, Birthday.3_1, AgeYears, HeightInches)
Calculating Length of Names
# Calculating the number of characters (letters) in each name
Name.Data <- CleanData %>%
mutate(FirstNameLength = nchar(FirstName),
LastNameLength = nchar(LastName))
Simplifying Region
# Removing states from the region values
Region.Data <- Name.Data %>%
mutate(Region = case_when(startsWith(Region, "Northeast") ~ "Northeast",
startsWith(Region, "Midwest") ~ "Midwest",
startsWith(Region, "Southeast") ~ "Southeast",
startsWith(Region, "Southwest") ~ "Southwest",
startsWith(Region, "West") ~ "West",
startsWith(Region, "International") ~ "International"))
Simplifying Birthday
# Combining the three birthday variables into a single one and converting to R date format; removing the original (now unnecessary) birthday variables
Bday.Data <- Region.Data %>%
mutate(Birthday = paste0(Birthday.1_1, " ", Birthday.2_1, ", ", Birthday.3_1)) %>%
mutate(Birthday = as.Date(Birthday, format = "%B %d, %Y")
) %>%
select(-Birthday.1_1, -Birthday.2_1, -Birthday.3_1)
Calculating Age in Days and Height in CMs
# Subtracting current date from birthday to calculate age in days as a numeric variable; ensuring height is numeric and converting height from feet to inches for individuals who accidentally reported their height as feet (since it is less than 7); converting height in inches to height in CMs; removing individuals who have an NA for height in cms or age in days (our main DVs)
FinalData <- Bday.Data %>%
mutate(AgeDays = as.numeric(difftime(Sys.Date(), Birthday, units = "days")),
HeightInches = as.numeric(HeightInches),
HeightInches = ifelse(HeightInches < 7, HeightInches*12, HeightInches),
AgeYears = as.numeric(AgeYears),
HeightCMs = HeightInches*2.54) %>%
filter(!is.na(HeightCMs) | !is.na(AgeYears))
Data Anlysis and Visualization
Comparing Height in CM and Height in Inches
# Correlation
cor(FinalData$HeightCMs, FinalData$HeightInches, method = "pearson")
## [1] 1
# Linear Regression
model1 <- glm(HeightCMs ~ HeightInches, data = FinalData)
summary(model1)
##
## Call:
## glm(formula = HeightCMs ~ HeightInches, data = FinalData)
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.626e-14 1.299e-13 5.87e-01 0.565
## HeightInches 2.540e+00 1.985e-15 1.28e+15 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 8.077936e-28)
##
## Null deviance: 1.3229e+03 on 19 degrees of freedom
## Residual deviance: 1.4540e-26 on 18 degrees of freedom
## AIC: -1187
##
## Number of Fisher Scoring iterations: 1
Visualizing this Relationship
# Density of predictor variable (IV)
FinalData %>%
ggplot(aes(x = HeightInches)) +
geom_density(fill = "#BDD7E7") + theme_minimal() +
labs(title = "Height in Inches of Summer 2024 Interns",
x = "Height in Inches")

# Linear relationship of IV and DV
FinalData %>%
ggplot(aes(x = HeightInches, y = HeightCMs)) +
geom_point() +
geom_smooth(method = "lm")+
labs(title = "The Perfect Linear Relationship between Height in CMs and Height in Inches",
subtitle = "\U03B2 = 2.54",
x = "Height in Inches",
y = "Height in CMs") +
theme_minimal()

Comparing Age in Days and Age in Years
# Correlation
cor(FinalData$AgeDays, FinalData$AgeYears, method = "pearson")
## [1] 0.9852028
# Linear Regression
model2 <- glm(AgeDays ~ AgeYears, data = FinalData)
summary(model2)
##
## Call:
## glm(formula = AgeDays ~ AgeYears, data = FinalData)
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 635.61 319.61 1.989 0.0622 .
## AgeYears 360.66 14.79 24.388 3.06e-15 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 11580.59)
##
## Null deviance: 7096066 on 19 degrees of freedom
## Residual deviance: 208451 on 18 degrees of freedom
## AIC: 247.79
##
## Number of Fisher Scoring iterations: 2
Visualizing this Relationship
# Density of predictor variable (IV)
FinalData %>%
ggplot(aes(x = AgeYears)) +
geom_density(fill = "#BDD7E7") + theme_minimal() +
labs(title = "Age in Years of Summer 2024 Interns",
x = "Age in Years")

# Linear relationship of IV and DV
FinalData %>%
ggplot(aes(x = AgeYears, y = AgeDays)) +
geom_point() +
geom_smooth(method = "lm")+
labs(title = "Age in Years by Age in days",
subtitle = "\U03B2 ≈ 365",
x = "Age in Years",
y = "Age in Days") +
theme_minimal()

Comparing Length of First Names and Last Names
# Correlation
cor(FinalData$FirstNameLength, FinalData$LastNameLength, method = "pearson")
## [1] 0.06410693
# Linear Regression
model2 <- glm(LastNameLength ~ FirstNameLength, data = FinalData)
summary(model2)
##
## Call:
## glm(formula = LastNameLength ~ FirstNameLength, data = FinalData)
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.19742 1.70702 3.045 0.00697 **
## FirstNameLength 0.08155 0.29920 0.273 0.78831
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 3.128755)
##
## Null deviance: 56.550 on 19 degrees of freedom
## Residual deviance: 56.318 on 18 degrees of freedom
## AIC: 83.463
##
## Number of Fisher Scoring iterations: 2
Visualizing this Relationship
# Density of predictor variable (IV)
FinalData %>%
ggplot(aes(x = FirstNameLength)) +
geom_density(fill = "#BDD7E7") + theme_minimal() +
labs(title = "Length of First Names of Summer 2024 Interns",
x = "Length of First Names")

# Linear relationship of IV and DV; note that we use geom_jitter instead of geom_point to see multiple points that have the same (x,y) coordinates. For example, there are multiple individuals with 6 character first names and 4 character last names, and we want to be able to see them all and not just one point at (6,4)
FinalData %>%
ggplot(aes(x = FirstNameLength, y = LastNameLength)) +
geom_jitter(height = 0.1, width = 0.1) +
geom_smooth(method = "lm")+
labs(title = "Last Name Length by First Name Length",
subtitle = "\U03B2 ≈ 0",
x = "First Name Length",
y = "Last Name Length") +
theme_minimal()

Height by Region
# Table
height.by.region <- FinalData %>%
group_by(Region) %>%
summarise(Mean.Height = mean(HeightCMs),
N = n()) %>%
arrange(desc(Mean.Height))
knitr::kable(height.by.region)
| Midwest |
171.8733 |
3 |
| International |
170.8150 |
3 |
| Northeast |
167.2771 |
7 |
| Southeast |
165.1000 |
1 |
| West |
161.2900 |
2 |
| Southwest |
158.7500 |
4 |
# Linear Model (ANOVA)
## Reordering the levels in order of descending mean
FinalData$Region <- factor(FinalData$Region, levels = height.by.region$Region)
model3 <- glm(HeightCMs ~ Region, data = FinalData)
summary(model3)
##
## Call:
## glm(formula = HeightCMs ~ Region, data = FinalData)
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 171.873 4.585 37.482 1.92e-15 ***
## RegionInternational -1.058 6.485 -0.163 0.8727
## RegionNortheast -4.596 5.481 -0.839 0.4158
## RegionSoutheast -6.773 9.171 -0.739 0.4724
## RegionWest -10.583 7.250 -1.460 0.1664
## RegionSouthwest -13.123 6.066 -2.163 0.0483 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 63.07914)
##
## Null deviance: 1322.85 on 19 degrees of freedom
## Residual deviance: 883.11 on 14 degrees of freedom
## AIC: 146.51
##
## Number of Fisher Scoring iterations: 2
Visualizing this Relationship
FinalData %>%
ggplot(aes(x = Region, y = HeightCMs, fill = Region)) +
scale_fill_brewer(palette = "Spectral") +
geom_boxplot() +
labs(title = "Height in CMs by Region",
x = "Region",
y = "Height in Cms") +
theme_minimal() +
theme(legend.position = "none")
