Assignment 3 Final

Loading Scripts

library(tidyverse)

── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.5.1     ✔ tibble    3.2.1
✔ lubridate 1.9.4     ✔ tidyr     1.3.1
✔ purrr     1.0.2     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(ipumsr)
library(ggplot2)
library(usmap)
library(ggthemes)
library(ggpubr)
library(kableExtra)


Attaching package: 'kableExtra'

The following object is masked from 'package:dplyr':

    group_rows

library(tidyr)

Setting up the Workspace

setwd("~/Desktop/econ code/assignment 3/")

Loading in IPUMS Data

ddi <- read_ipums_ddi("usa_00004.xml")
ipums_data <- read_ipums_micro(
  ddi,
  data_file = "/Users/zahrahussain/Desktop/Econ Code/Assignment 3/usa_00004.dat.gz"
)

Use of data from IPUMS USA is subject to conditions including that users should cite the data appropriately. Use command `ipums_conditions()` for more details.

Data Cleaning and Preparation

ipums_data <- ipums_data %>%
  mutate(
    YEAR = as.integer(YEAR),
    STATEFIP = as.character(STATEFIP),  # Convert to character for mapping
    AGE = as.integer(AGE),
    SEX = factor(SEX, labels = c("Male", "Female")),
    URBAN = as.character(URBAN),
    BPL = as.integer(BPL),  # Ensure Birthplace is numeric
    PERWT = as.numeric(PERWT),  # Ensure weight is numeric
    IND1950 = as.character(IND1950)  # Convert industry classification to character
  )

Specifying Groups

india_birthplace_codes <- c(521)
ipums_data <- ipums_data %>%
  mutate(india_born = if_else(BPL %in% india_birthplace_codes, 1, 0))

general_population_data <- ipums_data %>%
  filter(BPL != 521)  # Exclude India-born individuals

Population Pyramids Preparation

For India-born data and overall populatio

prepare_pyramid_data <- function(df, group_label = "Group") {
  df %>%
    filter(!is.na(AGE), !is.na(SEX)) %>%
    group_by(AGE, SEX) %>%
    summarise(population = sum(PERWT, na.rm = TRUE), .groups = "drop") %>%
    mutate(group = group_label)
}

india_born_data <- ipums_data %>% filter(india_born == 1)
pyramid_india_born <- prepare_pyramid_data(india_born_data, "India-born")
pyramid_general_pop <- prepare_pyramid_data(ipums_data, "All")

Creating the Pyramids

plot_pyramid <- function(pyramid_df, title_str) {
  pyramid_df %>%
    mutate(population = if_else(SEX == "Male", -population, population)) %>%
    ggplot(aes(x = AGE, y = population, fill = SEX)) +
    geom_bar(stat = "identity") +
    coord_flip() +
    scale_y_continuous(labels = abs) +
    scale_fill_manual(values = c("Male" = "blue", "Female" = "red")) +
    labs(
      title = title_str,
      x = "Age",
      y = "Weighted Population",
      fill = "Sex"
    ) +
    theme_minimal()
}

pyramid_india_plot <- plot_pyramid(pyramid_india_born, "Population Pyramid: India-born (1900 & 1910)")
pyramid_general_plot <- plot_pyramid(pyramid_general_pop, "Population Pyramid: General Population (1900 & 1910)")

pyramid_india_plot

pyramid_general_plot

Labor force

labforce_labels <- c(
  "0" = "N/A",
  "1" = "No, not in the labor force",
  "2" = "Yes, in the labor force",
  "9" = "Unclassifiable"
)

# Function to process labor force data
process_labor_force <- function(data) {
  data %>%
    group_by(YEAR, LABFORCE) %>%
    summarise(count = sum(PERWT, na.rm = TRUE), .groups = "drop") %>%
    mutate(
      LABFORCE = labforce_labels[as.character(LABFORCE)],  # Map labels
      Total = sum(count),  # Compute total for each year
      Percentage = (count / Total) * 100  # Compute percentage
    ) %>%
    select(YEAR, LABFORCE, count, Percentage) %>%
    pivot_wider(names_from = YEAR, values_from = c(count, Percentage), names_glue = "{YEAR}_{.value}") %>%
    mutate(
      `1900` = ifelse(!is.na(`1900_count`), paste0(`1900_count`, " (", round(`1900_Percentage`, 2), "%)"), "N/A"),
      `1910` = ifelse(!is.na(`1910_count`), paste0(`1910_count`, " (", round(`1910_Percentage`, 2), "%)"), "N/A")
    ) %>%
    select(LABFORCE, `1900`, `1910`)  # Keep only relevant columns
}

# Ensure the general population data is defined
if (!exists("general_population_data")) {
  general_population_data <- ipums_data  # Use full dataset for everyone
}

# Process labor force data for India-born individuals
labor_force_india <- india_born_data %>%
  process_labor_force() %>%
  rename(`1900 India-Born` = `1900`, `1910 India-Born` = `1910`)

# Process labor force data for the general population
labor_force_general <- general_population_data %>%
  process_labor_force() %>%
  rename(`1900 General` = `1900`, `1910 General` = `1910`)

# Merge both datasets to create a single comparison table
labor_force_comparison <- labor_force_general %>%
  left_join(labor_force_india, by = "LABFORCE")

# Display comparison table with kable
kable(labor_force_comparison, digits = 2, caption = "Comparison of Labor Force Status Between India-Born and General Population (1900 & 1910)")

Comparison of Labor Force Status Between India-Born and General Population (1900 & 1910)
LABFORCE	1900 General	1910 General	1900 India-Born	1910 India-Born
N/A	27603280 (16.39%)	31365865 (18.62%)	501 (7.04%)	502 (7.05%)
No, not in the labor force	21026481 (12.49%)	24769361 (14.71%)	901 (12.66%)	1103 (15.5%)
Yes, in the labor force	27437775 (16.29%)	36210178 (21.5%)	602 (8.46%)	3509 (49.3%)

Occupations

# Define a mapping of the specified OCC codes to occupation labels
occupation_labels <- c(
  "999" = "N/A or None Reported",
  "12"  = "Farmers (owners and tenants)",
  "46"  = "Carpenters",
  "304" = "Bookkeepers and accounting clerks",
  "91"  = "Truck, delivery, and tractor drivers",
  "988" = "Laborers, except construction",
  "994" = "Not specified service workers",
  "196" = "Clerical and kindred workers",
  "62"  = "Machinists",
  "74"  = "Electricians",
  "980" = "Operatives and kindred workers"
)

# Filter data for individuals aged 10+
filtered_data <- ipums_data %>% filter(AGE >= 10)

# Compute occupation distribution for the India-born group
occupation_group <- filtered_data %>%
  filter(BPL == 521) %>%  # India is coded as 521
  count(OCC, wt = PERWT) %>%  # Use PERWT for weighted count
  mutate(
    Percentage = n / sum(n) * 100,
    Occupation = occupation_labels[as.character(OCC)]  # Map only specified labels
  ) %>%
  filter(!is.na(Occupation)) %>%  # Keep only specified occupations
  arrange(desc(Percentage)) %>%
  head(10) %>%
  select(OCC, Occupation, n, Percentage)  # Keep only relevant columns

# Compute occupation distribution for the general population
occupation_population <- filtered_data %>%
  count(OCC, wt = PERWT) %>%  # Use PERWT for weighted count
  mutate(
    Percentage = n / sum(n) * 100,
    Occupation = occupation_labels[as.character(OCC)]  # Map only specified labels
  ) %>%
  filter(!is.na(Occupation)) %>%  # Keep only specified occupations
  arrange(desc(Percentage)) %>%
  head(10) %>%
  select(OCC, Occupation, n, Percentage)  # Keep only relevant columns

# Display tables
kable(occupation_group, digits = 2, caption = "Top 10 Occupations of India-Born Individuals (Age 10+)")

Top 10 Occupations of India-Born Individuals (Age 10+)
OCC	Occupation	n	Percentage
999	N/A or None Reported	2005	29.85
12	Farmers (owners and tenants)	603	8.98
46	Carpenters	502	7.47
304	Bookkeepers and accounting clerks	500	7.44
91	Truck, delivery, and tractor drivers	201	2.99
988	Laborers, except construction	201	2.99
994	Not specified service workers	201	2.99
196	Clerical and kindred workers	200	2.98
62	Machinists	101	1.50
74	Electricians	101	1.50

kable(occupation_population, digits = 2, caption = "Top 10 Occupations of the General Population (Age 10+)")

Top 10 Occupations of the General Population (Age 10+)
OCC	Occupation	n	Percentage
999	N/A or None Reported	54506584	41.95
304	Bookkeepers and accounting clerks	6892037	5.30
12	Farmers (owners and tenants)	2207438	1.70
196	Clerical and kindred workers	1442506	1.11
994	Not specified service workers	685775	0.53
91	Truck, delivery, and tractor drivers	513244	0.40
988	Laborers, except construction	459118	0.35
46	Carpenters	208899	0.16
62	Machinists	205815	0.16
980	Operatives and kindred workers	97883	0.08

Industries

# Define a mapping of the specified IND1950 codes to industry labels
industry_labels <- c(
  "0"   = "N/A or None Reported",
  "105" = "Agriculture",
  "826" = "Private Households",
  "246" = "Construction",
  "506" = "Railroads and Railway Express Service",
  "997" = "Nonclassifiable",
  "636" = "Food Stores, Except Dairy Products",
  "888" = "Educational Services",
  "439" = "Yarn, Thread, and Fabric Mills",
  "216" = "Coal Mining"
)

# Filter data for individuals aged 10+
filtered_data <- ipums_data %>% filter(AGE >= 10)

# Compute industry distribution for the India-born group
industry_group <- filtered_data %>%
  filter(BPL == 521) %>%  # India is coded as 521
  count(IND1950, wt = PERWT) %>%  # Use PERWT for weighted count
  mutate(
    Percentage = n / sum(n) * 100,
    Industry = industry_labels[as.character(IND1950)]  # Map only specified labels
  ) %>%
  filter(!is.na(Industry)) %>%  # Keep only specified industries
  arrange(desc(Percentage)) %>%
  head(10) %>%
  select(IND1950, Industry, n, Percentage)  # Keep only relevant columns

# Compute industry distribution for the general population
industry_population <- filtered_data %>%
  count(IND1950, wt = PERWT) %>%  # Use PERWT for weighted count
  mutate(
    Percentage = n / sum(n) * 100,
    Industry = industry_labels[as.character(IND1950)]  # Map only specified labels
  ) %>%
  filter(!is.na(Industry)) %>%  # Keep only specified industries
  arrange(desc(Percentage)) %>%
  head(10) %>%
  select(IND1950, Industry, n, Percentage)  # Keep only relevant columns

# Display tables
kable(industry_group, digits = 2, caption = "Top 10 Industries of India-Born Individuals (Age 10+)")

Top 10 Industries of India-Born Individuals (Age 10+)
IND1950	Industry	n	Percentage
0	N/A or None Reported	2805	41.77
105	Agriculture	1205	17.94
246	Construction	101	1.50
826	Private Households	100	1.49
888	Educational Services	100	1.49

kable(industry_population, digits = 2, caption = "Top 10 Industries of the General Population (Age 10+)")

Top 10 Industries of the General Population (Age 10+)
IND1950	Industry	n	Percentage
0	N/A or None Reported	64287504	49.48
105	Agriculture	22809334	17.55
826	Private Households	3872636	2.98
246	Construction	3304884	2.54
506	Railroads and Railway Express Service	2435376	1.87
997	Nonclassifiable	1849958	1.42
636	Food Stores, Except Dairy Products	1434992	1.10
888	Educational Services	1319729	1.02
439	Yarn, Thread, and Fabric Mills	1081252	0.83
216	Coal Mining	979816	0.75

Urban/Rural Comparison

# Define a mapping for URBAN codes (adjust based on dataset)
urban_labels <- c(
  "0" = "N/A",
  "1" = "Rural",
  "2" = "Urban"
)

# Function to process urban vs. rural data
process_urban_rural <- function(data) {
  data %>%
    group_by(YEAR, URBAN) %>%
    summarise(count = sum(PERWT, na.rm = TRUE), .groups = "drop") %>%
    mutate(
      URBAN = urban_labels[as.character(URBAN)],  # Map labels
      Total = sum(count),  # Compute total for each year
      Percentage = (count / Total) * 100  # Compute percentage
    ) %>%
    select(YEAR, URBAN, count, Percentage) %>%
    pivot_wider(names_from = YEAR, values_from = c(count, Percentage), names_glue = "{YEAR}_{.value}") %>%
    mutate(
      `1900` = ifelse(!is.na(`1900_count`), paste0(`1900_count`, " (", round(`1900_Percentage`, 2), "%)"), "N/A"),
      `1910` = ifelse(!is.na(`1910_count`), paste0(`1910_count`, " (", round(`1910_Percentage`, 2), "%)"), "N/A")
    ) %>%
    select(URBAN, `1900`, `1910`)  # Keep only relevant columns
}

# Ensure the general population data is defined
if (!exists("general_population_data")) {
  general_population_data <- ipums_data  # Use full dataset for everyone
}

# Process urban/rural data for India-born individuals
urban_rural_india <- india_born_data %>%
  process_urban_rural() %>%
  rename(`1900 India-Born` = `1900`, `1910 India-Born` = `1910`)

# Process urban/rural data for the general population
urban_rural_general <- general_population_data %>%
  process_urban_rural() %>%
  rename(`1900 General` = `1900`, `1910 General` = `1910`)

# Merge both datasets to create a single comparison table
urban_rural_comparison <- urban_rural_general %>%
  left_join(urban_rural_india, by = "URBAN")

# Display comparison table with kable
kable(urban_rural_comparison, digits = 2, caption = "Comparison of Urban/Rural Distribution Between India-Born and General Population (1900 & 1910)")

Comparison of Urban/Rural Distribution Between India-Born and General Population (1900 & 1910)
URBAN	1900 General	1910 General	1900 India-Born	1910 India-Born
Rural	46638181 (27.69%)	50952346 (30.25%)	400 (5.62%)	2006 (28.18%)
Urban	29429355 (17.47%)	41393058 (24.58%)	1604 (22.53%)	3108 (43.66%)

Nativity

# Define a mapping for NATIVITY codes
nativity_labels <- c(
  "0" = "N/A or Unknown",
  "1" = "Both Parents Native-Born",
  "2" = "Father Foreign, Mother Native",
  "3" = "Mother Foreign, Father Native",
  "4" = "Both Parents Foreign",
  "5" = "Foreign-Born"
)

# Function to process nativity data
process_nativity <- function(data) {
  data %>%
    group_by(YEAR, NATIVITY) %>%
    summarise(count = sum(PERWT, na.rm = TRUE), .groups = "drop") %>%
    mutate(
      NATIVITY = nativity_labels[as.character(NATIVITY)],  # Map labels
      Total = sum(count),  # Compute total for each year
      Percentage = (count / Total) * 100  # Compute percentage
    ) %>%
    select(YEAR, NATIVITY, count, Percentage) %>%
    pivot_wider(names_from = YEAR, values_from = c(count, Percentage), names_glue = "{YEAR}_{.value}") %>%
    mutate(
      `1900` = ifelse(!is.na(`1900_count`), paste0(`1900_count`, " (", round(`1900_Percentage`, 2), "%)"), "N/A"),
      `1910` = ifelse(!is.na(`1910_count`), paste0(`1910_count`, " (", round(`1910_Percentage`, 2), "%)"), "N/A")
    ) %>%
    select(NATIVITY, `1900`, `1910`)  # Keep only relevant columns
}

# Ensure the general population data is defined
if (!exists("general_population_data")) {
  general_population_data <- ipums_data  # Use full dataset for everyone
}

# Process nativity data for India-born individuals
nativity_india <- india_born_data %>%
  process_nativity() %>%
  rename(`1900 India-Born` = `1900`, `1910 India-Born` = `1910`)

# Process nativity data for the general population
nativity_general <- general_population_data %>%
  process_nativity() %>%
  rename(`1900  General` = `1900`, `1910 General` = `1910`)

# Merge both datasets to create a single comparison table
nativity_comparison <- nativity_general %>%
  left_join(nativity_india, by = "NATIVITY")

# Display comparison table with kable
kable(nativity_comparison, digits = 2, caption = "Comparison of Nativity Between India-Born and General Population (1900 & 1910)")

Comparison of Nativity Between India-Born and General Population (1900 & 1910)
NATIVITY	1900 General	1910 General	1900 India-Born	1910 India-Born
Both Parents Native-Born	49808238 (29.58%)	59242530 (35.18%)	NA	NA
Father Foreign, Mother Native	3496849 (2.08%)	4234224 (2.51%)	NA	NA
Mother Foreign, Father Native	1673298 (0.99%)	2227348 (1.32%)	NA	NA
Both Parents Foreign	10605032 (6.3%)	12971464 (7.7%)	NA	NA
Foreign-Born	10484119 (6.23%)	13669838 (8.12%)	2004 (28.15%)	5114 (71.85%)

Geographic Distribution (For Mapping)

geo_distribution_data <- ipums_data %>%
  group_by(STATEFIP) %>%
  summarise(
    total_state_population = sum(PERWT, na.rm = TRUE),  # Total weighted population in the state
    india_population = sum(if_else(india_born == 1, PERWT, 0), na.rm = TRUE),  # India-born weighted population
    india_prop_of_group = india_population / sum(india_population),  # Share of total India-born in each state
    india_prop_in_state = india_population / total_state_population  # % of state's population that is India-born
  ) %>%
  ungroup()

# Convert `STATEFIP` to full state names for clarity
geo_distribution_data <- geo_distribution_data %>%
  mutate(State = state.name[match(STATEFIP, state.abb)])

Mapping

geo_distribution_data <- geo_distribution_data %>%
  mutate(fips = as.numeric(STATEFIP))

map_india_population <- plot_usmap(data = geo_distribution_data, values = "india_population", regions = "states") +
  scale_fill_continuous(name = "India-born Pop.", low = "lightblue", high = "darkblue", label = scales::comma) +
  labs(title = "Number of India-born Individuals by State (1900 & 1910)",
       subtitle = "Source: IPUMS-USA",
       caption = "Darker states indicate a higher number of India-born residents") +
  theme_minimal()

map_india_share <- plot_usmap(data = geo_distribution_data, values = "india_prop_of_group", regions = "states") +
  scale_fill_continuous(name = "Share of India-born (%)", low = "lightgreen", high = "darkgreen", label = scales::percent) +
  labs(title = "Proportion of India-born Individuals in Each State (1900 & 1910)",
       subtitle = "Source: IPUMS-USA",
       caption = "Darker states indicate a higher proportion of the total India-born population") +
  theme_minimal()

map_india_relative <- plot_usmap(data = geo_distribution_data, values = "india_prop_in_state", regions = "states") +
  scale_fill_continuous(name = "India-born as % of State Pop.", low = "lightpink", high = "darkred", label = scales::percent) +
  labs(title = "Proportion of State Population That is India-born (1900 & 1910)",
       subtitle = "Source: IPUMS-USA",
       caption = "Darker states indicate a higher proportion of India-born residents in the state population") +
  theme_minimal()

map_india_population

map_india_share

map_india_relative