About the Dataset.

Year of Birth: A numerical variable indicating the year a child was born. Gender: A categorical variable specifying the child’s gender/ Ethnicity: A categorical variable representing the child’s ethnicity. Child’s First Name: A categorical variable with unique values for each distinct name. Count: A numerical variable indicating the frequency of a given name. Rank: A numerical variable representing the popularity ranking of a name.

  1. State the objective or questions for the hypothesis testing and Analysis
  1. Is there a relationship between Year of Birth and Name Popularity?

  2. Are there Gender Differences in Name Popularity?

  3. Are there Ethnicity Differences in Name Popularity?

  1. Load and install libraries
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.4.1
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)
library(ggplot2)
library(pwr)
## Warning: package 'pwr' was built under R version 4.4.1
  1. Load the dataset
baby_names <- read.csv("Popular_Baby_Names.csv")
  1. Summarize the dataset
  1. Summary of Numerical Variables
numerical_vars <- c("Year.of.Birth", "Count", "Rank")

summary_stats <- baby_names %>%
  select(all_of(numerical_vars)) %>%
  summary()

print(summary_stats)
##  Year.of.Birth      Count             Rank       
##  Min.   :2011   Min.   : 10.00   Min.   :  1.00  
##  1st Qu.:2012   1st Qu.: 13.00   1st Qu.: 37.00  
##  Median :2013   Median : 20.00   Median : 58.00  
##  Mean   :2014   Mean   : 33.78   Mean   : 56.75  
##  3rd Qu.:2014   3rd Qu.: 36.00   3rd Qu.: 78.00  
##  Max.   :2021   Max.   :446.00   Max.   :102.00
  1. Frequency distribution for Categorical Variables
categorical_vars <- c("Gender", "Ethnicity")

frequency_distributions <- baby_names %>%
  group_by(across(all_of(categorical_vars))) %>%
  count() %>%
  ungroup()

print(frequency_distributions)
## # A tibble: 14 × 3
##    Gender Ethnicity                      n
##    <chr>  <chr>                      <int>
##  1 FEMALE ASIAN AND PACI              1245
##  2 FEMALE ASIAN AND PACIFIC ISLANDER  4656
##  3 FEMALE BLACK NON HISP              1185
##  4 FEMALE BLACK NON HISPANIC          4969
##  5 FEMALE HISPANIC                   10412
##  6 FEMALE WHITE NON HISP              2567
##  7 FEMALE WHITE NON HISPANIC         10265
##  8 MALE   ASIAN AND PACI              1238
##  9 MALE   ASIAN AND PACIFIC ISLANDER  4727
## 10 MALE   BLACK NON HISP              1261
## 11 MALE   BLACK NON HISPANIC          5083
## 12 MALE   HISPANIC                    9953
## 13 MALE   WHITE NON HISP              2276
## 14 MALE   WHITE NON HISPANIC          9377
  1. Clean the dataset.(include rename)
  1. Check missing values
missing_values <- baby_names %>%
  summarise(across(everything()) %>%
  is.na() %>% 
  sum())

print(missing_values)
##   across(everything()) %>% is.na() %>% sum()
## 1                                          0
  1. Identify and remove outliers
iqr <- IQR(baby_names$Count)
upper_limit <- quantile(baby_names$Count, 0.75) + 1.5 * iqr
lower_limit <- quantile(baby_names$Count, 0.25) - 1.5 * iqr

clean_data <- baby_names %>%
  filter(Count >= lower_limit & Count <= upper_limit)

head(clean_data)
##   Year.of.Birth Gender Ethnicity Child.s.First.Name Count Rank
## 1          2011 FEMALE  HISPANIC          GERALDINE    13   75
## 2          2011 FEMALE  HISPANIC                GIA    21   67
## 3          2011 FEMALE  HISPANIC             GIANNA    49   42
## 4          2011 FEMALE  HISPANIC            GISELLE    38   51
## 5          2011 FEMALE  HISPANIC              GRACE    36   53
## 6          2011 FEMALE  HISPANIC          GUADALUPE    26   62
  1. Sampling
sample_baby_names <- clean_data %>%
  sample_n(6921, replace = FALSE)

head(sample_baby_names)
##   Year.of.Birth Gender                  Ethnicity Child.s.First.Name Count Rank
## 1          2015   MALE ASIAN AND PACIFIC ISLANDER                Ali    13   52
## 2          2013 FEMALE         WHITE NON HISPANIC             Harper    48   45
## 3          2014 FEMALE ASIAN AND PACIFIC ISLANDER             Nicole    28   22
## 4          2014 FEMALE         BLACK NON HISPANIC             Eliana    23   28
## 5          2011   MALE         WHITE NON HISPANIC          NATHANIEL    46   62
## 6          2013   MALE         BLACK NON HISPANIC            Richard    19   53
  1. Central Tendency
  1. Year Statistics
year_stats <- baby_names %>%
  summarise(
    min_year = min(Year.of.Birth),
    max_year = max(Year.of.Birth),
    range_year = max_year - min_year,
    mean_year = mean(Year.of.Birth),
    median_year = median(Year.of.Birth),
    mode_year = mode(Year.of.Birth),
    sd_year = sd(Year.of.Birth),
    variance_year = var(Year.of.Birth)
  )
  1. Count Statistics
count_stats <- baby_names %>%
  summarise(
    min_count = min(Count),
    max_count = max(Count),
    range_count = max_count - min_count,
    mean_count = mean(Count),
    median_count = median(Count),
    mode_count = mode(Count),
    sd_count = sd(Count),
    variance_count = var(Count)
  )
  1. Rank Statistics
rank_stats <- baby_names %>%
  summarise(
    min_rank = min(Rank),
    max_rank = max(Rank),
    range_rank = max_rank - min_rank,
    mean_rank = mean(Rank),
    median_rank = median(Rank),
    mode_rank = mode(Rank),
    sd_rank = sd(Rank),
    variance_rank = var(Rank)
    
  )
  1. Gender Statistics
gender_stats <- baby_names %>%
  count(Gender) %>%
  mutate(proportion = n / sum(n))
  1. Ethnicity Statistics
ethnicity_stats <- baby_names %>%
  count(Ethnicity) %>%
  mutate(proportion = n / sum(n))
  1. Child’s First Name Statistics
name_stats <- baby_names %>%
  group_by(Child.s.First.Name) %>%
  summarise(count = n()) %>%
  arrange(desc(count))
  1. Most Frequent Names
most_frequent_names <- head(name_stats, 10)
  1. Variance and STD
print(paste("The variance of Year of Birth is:", year_stats$variance_year))
## [1] "The variance of Year of Birth is: 6.34354052556836"
print(paste("The variance of Count is:", count_stats$variance_count))
## [1] "The variance of Count is: 1507.45362417075"
print(paste("The variance of Rank is:", rank_stats$variance_rank))
## [1] "The variance of Rank is: 650.238603457949"
print(paste("The standard deviation of Year of Birth is:", year_stats$sd_year))
## [1] "The standard deviation of Year of Birth is: 2.518638625442"
print(paste("The standard deviation of Count is:", count_stats$sd_count))
## [1] "The standard deviation of Count is: 38.8259400938439"
print(paste("The standard deviation of Rank is:", rank_stats$sd_rank))
## [1] "The standard deviation of Rank is: 25.499776537412"
  1. Percentile, Quartile, IQR and boxplots
calculate_quantiles <- function(x) {
  data.frame(
    percentile_10 = quantile(x, 0.1),
    percentile_25 = quantile(x, 0.25),
    percentile_50 = quantile(x, 0.5),
    percentile_75 = quantile(x, 0.75),
    percentile_90 = quantile(x, 0.9),
    quartile_1 = quantile(x, 0.25),
    quartile_3 = quantile(x, 0.75),
    iqr = diff(quantile(x, c(0.25, 0.75)))
  )
}
baby_names %>%
  summarise(
    year_quantiles = list(calculate_quantiles(Year.of.Birth)),
    count_quantiles = list(calculate_quantiles(Count)),
    rank_quantiles = list(calculate_quantiles(Rank))
  ) %>%
  unnest(cols = c(year_quantiles, count_quantiles, rank_quantiles), names_sep = "_") %>%
  print()
## # A tibble: 1 × 24
##   year_quantiles_percentile_10 year_quantiles_percentil…¹ year_quantiles_perce…²
##                          <dbl>                      <dbl>                  <dbl>
## 1                         2011                       2012                   2013
## # ℹ abbreviated names: ¹​year_quantiles_percentile_25,
## #   ²​year_quantiles_percentile_50
## # ℹ 21 more variables: year_quantiles_percentile_75 <dbl>,
## #   year_quantiles_percentile_90 <dbl>, year_quantiles_quartile_1 <dbl>,
## #   year_quantiles_quartile_3 <dbl>, year_quantiles_iqr <dbl>,
## #   count_quantiles_percentile_10 <dbl>, count_quantiles_percentile_25 <dbl>,
## #   count_quantiles_percentile_50 <dbl>, count_quantiles_percentile_75 <dbl>, …

Box plots

ggplot(baby_names, aes(x = 1, y = Year.of.Birth)) +
  geom_boxplot() +
  labs(title = "Boxplot of Year of Birth")

ggplot(baby_names, aes(x = 1, y = Count)) +
  geom_boxplot() +
  labs(title = "Boxplot of Count")

ggplot(baby_names, aes(x = 1, y = Rank)) +
  geom_boxplot() +
  labs(title = "Boxplot of Rank")

  1. Plots, Distribution and skewness

a1) Distribution of Year of Birth

ggplot(baby_names, aes(x = Year.of.Birth)) +
  geom_histogram(binwidth = 1) +
  labs(title = "Histogram of Year of Birth")

a2) Distribution of Count

ggplot(baby_names, aes(x = Count)) +
  geom_histogram(binwidth = 5) +
  labs(title = "Histogram of Count")

a3) Distribution of Rank

ggplot(baby_names, aes(x = Rank)) +
  geom_histogram(binwidth = 5) +
  labs(title = "Histogram of Rank")

b) Skewness

print("Distribution of Year of Birth is skewed to the right")
## [1] "Distribution of Year of Birth is skewed to the right"
print("Distribution of Distribution of Count is J-shaped")
## [1] "Distribution of Distribution of Count is J-shaped"
print("Distribution of Rank is multimodal ")
## [1] "Distribution of Rank is multimodal "
  1. Plots
ggplot(baby_names, aes(x = Count, y = Rank)) +
  geom_point() +
  labs(title = "Scatter Plot of Count vs. Rank")

ggplot(baby_names, aes(x = Year.of.Birth, y = Count)) +
  geom_point() +
  labs(title = "Scatter Plot of Year of Birth vs. Count")

ggplot(baby_names, aes(x = Gender, y = Count)) +
  geom_boxplot() +
  labs(title = "Box Plot of Count by Gender")

ggplot(baby_names, aes(x = Ethnicity, y = Count)) +
  geom_bar(stat = "summary", fun = "mean") +
  labs(title = "Bar Plot of Average Count by Ethnicity")

ggplot(baby_names, aes(x = Year.of.Birth)) +
  geom_density(fill = "blue", alpha = 0.5) +
  labs(title = "Density Plot of Year of Birth")

ggplot(baby_names, aes(x = Count)) +
  geom_density(fill = "green", alpha = 0.5) +
  labs(title = "Density Plot of Count")

ggplot(baby_names, aes(x = Gender)) +
  geom_bar(fill = "orange") +
  labs(title = "Bar Plot of Gender")

ggplot(baby_names, aes(x = Ethnicity)) +
  geom_bar(fill = "purple") +
  labs(title = "Bar Plot of Ethnicity")

  1. Confidence interval Effect size, Power and Significance testing a)Confidence Interval
confidence_level <- 0.95

confidence_intervals <- baby_names %>%
  group_by(Gender) %>%
  summarize(
    mean_count = mean(Count),
    sd_count = sd(Count),
    n = n(),
    lower_ci = mean_count - qt(1 - confidence_level/2, df = n - 1) * sd_count / sqrt(n),
    upper_ci = mean_count + qt(1 - confidence_level/2, df = n - 1) * sd_count / sqrt(n)
  )

print(confidence_intervals)
## # A tibble: 2 × 6
##   Gender mean_count sd_count     n lower_ci upper_ci
##   <chr>       <dbl>    <dbl> <int>    <dbl>    <dbl>
## 1 FEMALE       29.1     31.2 35299     29.0     29.1
## 2 MALE         38.7     44.9 33915     38.7     38.7
  1. Cohen’s d
cohen_d <- baby_names %>%
  group_by(Gender) %>%
  summarize(
    mean_count = mean(Count),
    sd_count = sd(Count),
    n = n()
  ) %>%
  mutate(
    pooled_sd = sqrt(((n[1] - 1) * sd_count[1]^2 + (n[2] - 1) * sd_count[2]^2) / (n[1] + n[2] - 2)),
    cohen_d = (mean_count[1] - mean_count[2]) / pooled_sd
  )

print(cohen_d$cohen_d)
## [1] -0.2503134 -0.2503134
  1. Power Analysis
power_analysis <- pwr.t.test(n = NULL, d = 0.5, sig.level = 0.05, power = 0.8)

print(power_analysis)
## 
##      Two-sample t test power calculation 
## 
##               n = 63.76561
##               d = 0.5
##       sig.level = 0.05
##           power = 0.8
##     alternative = two.sided
## 
## NOTE: n is number in *each* group
  1. Hypothesis Testing ( min of 2 and max of 3 hypothesis)
  1. Define Null and Alternative with the tailed tests

a1) Null: There is no significant difference in the popularity of names across different birth years. Alternative: The popularity of names has changed significantly over time. Tailed test: Two-tailed

a2) Null: There is no significant difference in the popularity of names between males and females. Alternative: The popularity of names varies significantly between males and females. Tailed test: Two-tailed

a3) Null: There is no significant difference in the popularity of names across different ethnicities. Alternative: The popularity of names varies significantly across different ethnicities. Tailed test: Two-tailed

  1. Do the test

b1) Hypothesis 1

model <- baby_names %>%
  lm(Count ~ Year.of.Birth, data = .)
p_value <- summary(model)$coefficients[2, 4]

print(p_value)
## [1] 5.516385e-15

b2) Hypothesis 2

t_test <- baby_names %>%
  t.test(Count ~ Gender, data = .)

print(t_test)
## 
##  Welch Two Sample t-test
## 
## data:  Count by Gender
## t = -32.692, df = 60239, p-value < 2.2e-16
## alternative hypothesis: true difference in means between group FEMALE and group MALE is not equal to 0
## 95 percent confidence interval:
##  -10.221682  -9.065346
## sample estimates:
## mean in group FEMALE   mean in group MALE 
##             29.05402             38.69754

b3) Hypothesis 3

anova <- baby_names %>%
  aov(Count ~ Ethnicity, data = .)

print(summary(anova))
##                Df    Sum Sq Mean Sq F value Pr(>F)    
## Ethnicity       6   1969743  328290   221.9 <2e-16 ***
## Residuals   69207 102365645    1479                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
  1. Conclusion

c1) Reject the null hypothesis and accept the alternative c2) Reject the null hypothesis and accept the alternative c3) Reject the null hypothesis and accept the alternative