Year of Birth: A numerical variable indicating the year a child was born. Gender: A categorical variable specifying the child’s gender/ Ethnicity: A categorical variable representing the child’s ethnicity. Child’s First Name: A categorical variable with unique values for each distinct name. Count: A numerical variable indicating the frequency of a given name. Rank: A numerical variable representing the popularity ranking of a name.
Is there a relationship between Year of Birth and Name Popularity?
Are there Gender Differences in Name Popularity?
Are there Ethnicity Differences in Name Popularity?
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.4.1
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)
library(ggplot2)
library(pwr)
## Warning: package 'pwr' was built under R version 4.4.1
baby_names <- read.csv("Popular_Baby_Names.csv")
numerical_vars <- c("Year.of.Birth", "Count", "Rank")
summary_stats <- baby_names %>%
select(all_of(numerical_vars)) %>%
summary()
print(summary_stats)
## Year.of.Birth Count Rank
## Min. :2011 Min. : 10.00 Min. : 1.00
## 1st Qu.:2012 1st Qu.: 13.00 1st Qu.: 37.00
## Median :2013 Median : 20.00 Median : 58.00
## Mean :2014 Mean : 33.78 Mean : 56.75
## 3rd Qu.:2014 3rd Qu.: 36.00 3rd Qu.: 78.00
## Max. :2021 Max. :446.00 Max. :102.00
categorical_vars <- c("Gender", "Ethnicity")
frequency_distributions <- baby_names %>%
group_by(across(all_of(categorical_vars))) %>%
count() %>%
ungroup()
print(frequency_distributions)
## # A tibble: 14 × 3
## Gender Ethnicity n
## <chr> <chr> <int>
## 1 FEMALE ASIAN AND PACI 1245
## 2 FEMALE ASIAN AND PACIFIC ISLANDER 4656
## 3 FEMALE BLACK NON HISP 1185
## 4 FEMALE BLACK NON HISPANIC 4969
## 5 FEMALE HISPANIC 10412
## 6 FEMALE WHITE NON HISP 2567
## 7 FEMALE WHITE NON HISPANIC 10265
## 8 MALE ASIAN AND PACI 1238
## 9 MALE ASIAN AND PACIFIC ISLANDER 4727
## 10 MALE BLACK NON HISP 1261
## 11 MALE BLACK NON HISPANIC 5083
## 12 MALE HISPANIC 9953
## 13 MALE WHITE NON HISP 2276
## 14 MALE WHITE NON HISPANIC 9377
missing_values <- baby_names %>%
summarise(across(everything()) %>%
is.na() %>%
sum())
print(missing_values)
## across(everything()) %>% is.na() %>% sum()
## 1 0
iqr <- IQR(baby_names$Count)
upper_limit <- quantile(baby_names$Count, 0.75) + 1.5 * iqr
lower_limit <- quantile(baby_names$Count, 0.25) - 1.5 * iqr
clean_data <- baby_names %>%
filter(Count >= lower_limit & Count <= upper_limit)
head(clean_data)
## Year.of.Birth Gender Ethnicity Child.s.First.Name Count Rank
## 1 2011 FEMALE HISPANIC GERALDINE 13 75
## 2 2011 FEMALE HISPANIC GIA 21 67
## 3 2011 FEMALE HISPANIC GIANNA 49 42
## 4 2011 FEMALE HISPANIC GISELLE 38 51
## 5 2011 FEMALE HISPANIC GRACE 36 53
## 6 2011 FEMALE HISPANIC GUADALUPE 26 62
sample_baby_names <- clean_data %>%
sample_n(6921, replace = FALSE)
head(sample_baby_names)
## Year.of.Birth Gender Ethnicity Child.s.First.Name Count Rank
## 1 2015 MALE ASIAN AND PACIFIC ISLANDER Ali 13 52
## 2 2013 FEMALE WHITE NON HISPANIC Harper 48 45
## 3 2014 FEMALE ASIAN AND PACIFIC ISLANDER Nicole 28 22
## 4 2014 FEMALE BLACK NON HISPANIC Eliana 23 28
## 5 2011 MALE WHITE NON HISPANIC NATHANIEL 46 62
## 6 2013 MALE BLACK NON HISPANIC Richard 19 53
year_stats <- baby_names %>%
summarise(
min_year = min(Year.of.Birth),
max_year = max(Year.of.Birth),
range_year = max_year - min_year,
mean_year = mean(Year.of.Birth),
median_year = median(Year.of.Birth),
mode_year = mode(Year.of.Birth),
sd_year = sd(Year.of.Birth),
variance_year = var(Year.of.Birth)
)
count_stats <- baby_names %>%
summarise(
min_count = min(Count),
max_count = max(Count),
range_count = max_count - min_count,
mean_count = mean(Count),
median_count = median(Count),
mode_count = mode(Count),
sd_count = sd(Count),
variance_count = var(Count)
)
rank_stats <- baby_names %>%
summarise(
min_rank = min(Rank),
max_rank = max(Rank),
range_rank = max_rank - min_rank,
mean_rank = mean(Rank),
median_rank = median(Rank),
mode_rank = mode(Rank),
sd_rank = sd(Rank),
variance_rank = var(Rank)
)
gender_stats <- baby_names %>%
count(Gender) %>%
mutate(proportion = n / sum(n))
ethnicity_stats <- baby_names %>%
count(Ethnicity) %>%
mutate(proportion = n / sum(n))
name_stats <- baby_names %>%
group_by(Child.s.First.Name) %>%
summarise(count = n()) %>%
arrange(desc(count))
most_frequent_names <- head(name_stats, 10)
print(paste("The variance of Year of Birth is:", year_stats$variance_year))
## [1] "The variance of Year of Birth is: 6.34354052556836"
print(paste("The variance of Count is:", count_stats$variance_count))
## [1] "The variance of Count is: 1507.45362417075"
print(paste("The variance of Rank is:", rank_stats$variance_rank))
## [1] "The variance of Rank is: 650.238603457949"
print(paste("The standard deviation of Year of Birth is:", year_stats$sd_year))
## [1] "The standard deviation of Year of Birth is: 2.518638625442"
print(paste("The standard deviation of Count is:", count_stats$sd_count))
## [1] "The standard deviation of Count is: 38.8259400938439"
print(paste("The standard deviation of Rank is:", rank_stats$sd_rank))
## [1] "The standard deviation of Rank is: 25.499776537412"
calculate_quantiles <- function(x) {
data.frame(
percentile_10 = quantile(x, 0.1),
percentile_25 = quantile(x, 0.25),
percentile_50 = quantile(x, 0.5),
percentile_75 = quantile(x, 0.75),
percentile_90 = quantile(x, 0.9),
quartile_1 = quantile(x, 0.25),
quartile_3 = quantile(x, 0.75),
iqr = diff(quantile(x, c(0.25, 0.75)))
)
}
baby_names %>%
summarise(
year_quantiles = list(calculate_quantiles(Year.of.Birth)),
count_quantiles = list(calculate_quantiles(Count)),
rank_quantiles = list(calculate_quantiles(Rank))
) %>%
unnest(cols = c(year_quantiles, count_quantiles, rank_quantiles), names_sep = "_") %>%
print()
## # A tibble: 1 × 24
## year_quantiles_percentile_10 year_quantiles_percentil…¹ year_quantiles_perce…²
## <dbl> <dbl> <dbl>
## 1 2011 2012 2013
## # ℹ abbreviated names: ¹year_quantiles_percentile_25,
## # ²year_quantiles_percentile_50
## # ℹ 21 more variables: year_quantiles_percentile_75 <dbl>,
## # year_quantiles_percentile_90 <dbl>, year_quantiles_quartile_1 <dbl>,
## # year_quantiles_quartile_3 <dbl>, year_quantiles_iqr <dbl>,
## # count_quantiles_percentile_10 <dbl>, count_quantiles_percentile_25 <dbl>,
## # count_quantiles_percentile_50 <dbl>, count_quantiles_percentile_75 <dbl>, …
Box plots
ggplot(baby_names, aes(x = 1, y = Year.of.Birth)) +
geom_boxplot() +
labs(title = "Boxplot of Year of Birth")
ggplot(baby_names, aes(x = 1, y = Count)) +
geom_boxplot() +
labs(title = "Boxplot of Count")
ggplot(baby_names, aes(x = 1, y = Rank)) +
geom_boxplot() +
labs(title = "Boxplot of Rank")
a1) Distribution of Year of Birth
ggplot(baby_names, aes(x = Year.of.Birth)) +
geom_histogram(binwidth = 1) +
labs(title = "Histogram of Year of Birth")
a2) Distribution of Count
ggplot(baby_names, aes(x = Count)) +
geom_histogram(binwidth = 5) +
labs(title = "Histogram of Count")
a3) Distribution of Rank
ggplot(baby_names, aes(x = Rank)) +
geom_histogram(binwidth = 5) +
labs(title = "Histogram of Rank")
b) Skewness
print("Distribution of Year of Birth is skewed to the right")
## [1] "Distribution of Year of Birth is skewed to the right"
print("Distribution of Distribution of Count is J-shaped")
## [1] "Distribution of Distribution of Count is J-shaped"
print("Distribution of Rank is multimodal ")
## [1] "Distribution of Rank is multimodal "
ggplot(baby_names, aes(x = Count, y = Rank)) +
geom_point() +
labs(title = "Scatter Plot of Count vs. Rank")
ggplot(baby_names, aes(x = Year.of.Birth, y = Count)) +
geom_point() +
labs(title = "Scatter Plot of Year of Birth vs. Count")
ggplot(baby_names, aes(x = Gender, y = Count)) +
geom_boxplot() +
labs(title = "Box Plot of Count by Gender")
ggplot(baby_names, aes(x = Ethnicity, y = Count)) +
geom_bar(stat = "summary", fun = "mean") +
labs(title = "Bar Plot of Average Count by Ethnicity")
ggplot(baby_names, aes(x = Year.of.Birth)) +
geom_density(fill = "blue", alpha = 0.5) +
labs(title = "Density Plot of Year of Birth")
ggplot(baby_names, aes(x = Count)) +
geom_density(fill = "green", alpha = 0.5) +
labs(title = "Density Plot of Count")
ggplot(baby_names, aes(x = Gender)) +
geom_bar(fill = "orange") +
labs(title = "Bar Plot of Gender")
ggplot(baby_names, aes(x = Ethnicity)) +
geom_bar(fill = "purple") +
labs(title = "Bar Plot of Ethnicity")
confidence_level <- 0.95
confidence_intervals <- baby_names %>%
group_by(Gender) %>%
summarize(
mean_count = mean(Count),
sd_count = sd(Count),
n = n(),
lower_ci = mean_count - qt(1 - confidence_level/2, df = n - 1) * sd_count / sqrt(n),
upper_ci = mean_count + qt(1 - confidence_level/2, df = n - 1) * sd_count / sqrt(n)
)
print(confidence_intervals)
## # A tibble: 2 × 6
## Gender mean_count sd_count n lower_ci upper_ci
## <chr> <dbl> <dbl> <int> <dbl> <dbl>
## 1 FEMALE 29.1 31.2 35299 29.0 29.1
## 2 MALE 38.7 44.9 33915 38.7 38.7
cohen_d <- baby_names %>%
group_by(Gender) %>%
summarize(
mean_count = mean(Count),
sd_count = sd(Count),
n = n()
) %>%
mutate(
pooled_sd = sqrt(((n[1] - 1) * sd_count[1]^2 + (n[2] - 1) * sd_count[2]^2) / (n[1] + n[2] - 2)),
cohen_d = (mean_count[1] - mean_count[2]) / pooled_sd
)
print(cohen_d$cohen_d)
## [1] -0.2503134 -0.2503134
power_analysis <- pwr.t.test(n = NULL, d = 0.5, sig.level = 0.05, power = 0.8)
print(power_analysis)
##
## Two-sample t test power calculation
##
## n = 63.76561
## d = 0.5
## sig.level = 0.05
## power = 0.8
## alternative = two.sided
##
## NOTE: n is number in *each* group
a1) Null: There is no significant difference in the popularity of names across different birth years. Alternative: The popularity of names has changed significantly over time. Tailed test: Two-tailed
a2) Null: There is no significant difference in the popularity of names between males and females. Alternative: The popularity of names varies significantly between males and females. Tailed test: Two-tailed
a3) Null: There is no significant difference in the popularity of names across different ethnicities. Alternative: The popularity of names varies significantly across different ethnicities. Tailed test: Two-tailed
b1) Hypothesis 1
model <- baby_names %>%
lm(Count ~ Year.of.Birth, data = .)
p_value <- summary(model)$coefficients[2, 4]
print(p_value)
## [1] 5.516385e-15
b2) Hypothesis 2
t_test <- baby_names %>%
t.test(Count ~ Gender, data = .)
print(t_test)
##
## Welch Two Sample t-test
##
## data: Count by Gender
## t = -32.692, df = 60239, p-value < 2.2e-16
## alternative hypothesis: true difference in means between group FEMALE and group MALE is not equal to 0
## 95 percent confidence interval:
## -10.221682 -9.065346
## sample estimates:
## mean in group FEMALE mean in group MALE
## 29.05402 38.69754
b3) Hypothesis 3
anova <- baby_names %>%
aov(Count ~ Ethnicity, data = .)
print(summary(anova))
## Df Sum Sq Mean Sq F value Pr(>F)
## Ethnicity 6 1969743 328290 221.9 <2e-16 ***
## Residuals 69207 102365645 1479
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
c1) Reject the null hypothesis and accept the alternative c2) Reject the null hypothesis and accept the alternative c3) Reject the null hypothesis and accept the alternative