#loading necessary libraries
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggplot2)
#reading the csv file
df_income <- read.csv("./censusincome.csv")
Null Hypothesis (\(H_0\)):
Alternative Hypothesis (\(H_1\)):
Using power analysis, we evaluate whether we have enough amount of data to detect the desired true difference between out proportions.
We choose a power of 80%, meaning we want an 80% chance of detecting a true effect in our data set if it exists. This implies \(\beta = 0.20\) which is essentially our Type II error i.e. failing to rejecting a false null hypothesis.
#load the pwr package
library(pwr)
#We pick a cohen's value of 0.2, implying we want to detect small effect size
pwr.2p.test(h=0.2, sig.level = 0.05, power = 0.80)
##
## Difference of proportion power calculation for binomial distribution (arcsine transformation)
##
## h = 0.2
## n = 392.443
## sig.level = 0.05
## power = 0.8
## alternative = two.sided
##
## NOTE: same sample sizes
df_income_power_size <- df_income |>
group_by(sex, income) |>
summarise(count = n())
## `summarise()` has grouped output by 'sex'. You can override using the `.groups`
## argument.
df_income_power_size
# Group by income and gender, and count the number of individuals in each group
df_income_grouped <- df_income |>
group_by(income, sex) |>
summarise(Count = n())
## `summarise()` has grouped output by 'income'. You can override using the
## `.groups` argument.
# Extract the counts for each income group and gender
male_greater_50K <- df_income_grouped$Count[df_income_grouped$sex == " Male" & df_income_grouped$income == " >50K"]
female_greater_50K <- df_income_grouped$Count[df_income_grouped$sex == " Female" & df_income_grouped$income == " >50K"]
# Calculate the total number of males and females
total_male <- sum(df_income_grouped$Count[df_income_grouped$sex == " Male"])
total_female <- sum(df_income_grouped$Count[df_income_grouped$sex == " Female"])
# Perform the two-proportion z-test
prop_test <- prop.test(x = c(male_greater_50K, female_greater_50K), n = c(total_male, total_female))
# Print the result of the two-proportion z-test
print(prop_test)
##
## 2-sample test for equality of proportions with continuity correction
##
## data: c(male_greater_50K, female_greater_50K) out of c(total_male, total_female)
## X-squared = 1517.8, df = 1, p-value < 2.2e-16
## alternative hypothesis: two.sided
## 95 percent confidence interval:
## 0.1877104 0.2048416
## sample estimates:
## prop 1 prop 2
## 0.3057366 0.1094606
From the results above we see that \(\mathbf{X^2 = 1517.8}\) which is indicating a significantly high difference between the observed and the expected proportions in our sample data.
We have 1 degree of freedom which indicate the number of independent comparisons we’re making. In our case we’re comparing the proportion of men who earn above 50K and the proportion of women who earn above 50K, and hence the degree of freedom is 1.
Our p-value is less than \(2.2e-16\) which is extremely small, much smaller compared to \(\alpha = 0.05\) which implies that there is a strong evidence against the null hypothesis. This consequently translates that a huge difference in proportions of men earning above 50k and proportions of women earning 50k.
Lastly we have our confidence level being [0.1877, 0.2048] indicating that we are 95% confidence that the difference in proportions of men and women above the 50K income bracket is between \(18.8\%\) and \(20.48\%\). We note that \(0\) is not part of that interval. A z-statistic of zero would imply there is zero difference in women proportion earning above 50K and men proportion earning above 50k bracket, hence we don’t have a null case represented in our interval.
Our sample estimates men earning above 50K to 30.6% and women earning above 50K to 10.9%.
Our conclusion - Based on the above results and the strong case against the null hypothesis, we reject the null hypothesis, and accept the alternate hypothesis that poses that there is a difference in proportion of men and women in the above 50K income bracket per year.
We can Visualize our hypothesis as shown below
# Define the Z-distribution curve
z_values <- seq(-4, 4, by = 0.01)
density_values <- dnorm(z_values)
# Define the observed sample estimates and confidence intervals from your results
sample_estimate <- 0.3057366 - 0.1094606 # Difference in proportions
lower_ci <- 0.1877104
upper_ci <- 0.2048416
alpha <- 0.05 # Significance level
# Create the plot
ggplot(data.frame(z = z_values, density = density_values), aes(x = z, y = density)) +
geom_line(size = 1.0, color = "blue") + # Z-distribution curve
geom_area(data = subset(data.frame(z = z_values, density = density_values), z > qnorm(1 - alpha/2)), aes(x = z), fill = "red", alpha = 0.3) + # Upper alpha region
geom_area(data = subset(data.frame(z = z_values, density = density_values), z < qnorm(alpha/2)), aes(x = z), fill = "red", alpha = 0.3) + # Lower alpha region
geom_vline(xintercept = sample_estimate, color = "green", linetype = "dashed", size = 1, show.legend = TRUE) + # Sample estimate
geom_vline(xintercept = lower_ci, color = "black", linetype = "dotted", size = 1, show.legend = TRUE) + # Lower CI
geom_vline(xintercept = upper_ci, color = "black", linetype = "dotted", size = 1, show.legend = TRUE) + # Upper CI
labs(title = "Z-Distribution with Confidence Interval, Alpha, and Sample Estimate", x = "Z-Score", y = "Density") +
annotate("text", x = sample_estimate + 0.1, y = 0.3, label = paste("Sample Estimate:", round(sample_estimate, 3)), color = "green", size = 4) +
annotate("text", x = lower_ci - 1.2, y = 0.05, label = paste("Lower CI:", round(lower_ci, 3)), color = "black", size = 4) +
annotate("text", x = upper_ci + 1.2, y = 0.05, label = paste("Upper CI:", round(upper_ci, 3)), color = "black", size = 4) +
theme_minimal()
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
\[ z = \sqrt{X^2} \]
which gives us a z statistic of \(\approx 39\). This value is way deep into the rejection region of the plot show above. Again, emphasizing the need to reject the null hypothesis that states there is no difference in income levels between the gender.
Null Hypothesis (\(H_0)\): The proportion of individuals earning above 50K is the same across different education levels.
Alternate Hypothesis (\(H_1\)): The proportion of individuals earning above 50K is NOT the same across different education levels.
# Group the data by education and income and count the occurrences
df_income_grouped <- df_income |>
group_by(education, income) |>
summarise(Count = n())
## `summarise()` has grouped output by 'education'. You can override using the
## `.groups` argument.
# Create a contingency table for education and income
c_table_edu_income <- table(df_income$education, df_income$income)
# Perform the Chi-Square Test of Independence
chi_test <- chisq.test(c_table_edu_income)
# Print the results of the Chi-Square test
print(chi_test)
##
## Pearson's Chi-squared test
##
## data: c_table_edu_income
## X-squared = 4429.7, df = 15, p-value < 2.2e-16
We have the X-Squared statistic as 4429.7 which essentially measures how much our expected proportions as defined in the null hypothesis differ from the observed proportions in our data set. Again, just like in the first relationship we investigated, we have a strong case here against the null hypothesis.
The other piece of information we have is the degrees of freedom which is 15 in our case, which is a product of the income categories and the education categories. i.e.
\[ d_f = (\text{no. of education categories} - 1) \times (\text{no of income categories} - 1) \]
We have our p-value of \(< 2.2e-16\) which is really small, and thus the probability of finding an observation in the null space is extremely low.
The conclusion here is to reject the null hypothesis due to the small value of p which suggests a significant relationship between education level and income level. Therefore, we are essentially concluding that an individual’s income level (whether below or above 50K) will vary based on their education levels.
The visualization of our chi-square test is a follows:
# Define the degrees of freedom and alpha level
df <- 15 # Degrees of freedom based on your test result
alpha <- 0.05 # Significance level
# Create the sequence for Chi-square values
x_values <- seq(0, 40, by = 0.01) # Adjust the upper limit based on your data
chi_sq_values <- dchisq(x_values, df)
# Calculate the critical value for 95% confidence level
critical_value <- qchisq(1 - alpha, df)
# Create the plot
ggplot(data.frame(x = x_values, y = chi_sq_values), aes(x = x, y = y)) +
geom_line(size = 1.2, color = "black") + # Chi-square distribution curve
geom_area(data = subset(data.frame(x = x_values, y = chi_sq_values), x > critical_value), aes(x = x, y = y), fill = "red", alpha = 0.5) + # Rejection region
geom_vline(xintercept = critical_value, linetype = "dashed", color = "blue", size = 1) + # Critical value line
annotate("text", x = critical_value + 2, y = 0.02, label = paste("Critical Value =", round(critical_value, 2)), color = "blue", size = 3) +
annotate("text", x = 10, y = 0.04, label = "1 - c (Confidence Level)", color = "black", size = 3) +
annotate("text", x = critical_value + 8, y = 0.005, label = "Rejection Region", color = "red", size = 3) +
labs(title = "Chi-Square Distribution", x = expression(chi^2), y = "Density") +
theme_minimal()