Assignment 3 Final

Loading Scripts

library(tidyverse)
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.5.1     ✔ tibble    3.2.1
✔ lubridate 1.9.4     ✔ tidyr     1.3.1
✔ purrr     1.0.2     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ipumsr)
library(ggplot2)
library(usmap)
library(ggthemes)
library(ggpubr)
library(kableExtra)

Attaching package: 'kableExtra'

The following object is masked from 'package:dplyr':

    group_rows
library(tidyr)

Setting up the Workspace

# Make the assignment folder the working directory so that relative file
# paths used later in the script (e.g. the DDI .xml file) resolve.
setwd("~/Desktop/econ code/assignment 3/")

Loading in IPUMS Data

# Read the DDI codebook, then the microdata it describes. Both files are
# referenced relative to the working directory: the original absolute path
# embedded one user's home directory and would fail on any other machine.
ddi <- read_ipums_ddi("usa_00004.xml")
ipums_data <- read_ipums_micro(
  ddi,
  data_file = "usa_00004.dat.gz"
)
Use of data from IPUMS USA is subject to conditions including that users should cite the data appropriately. Use command `ipums_conditions()` for more details.

Data Cleaning and Preparation

# Coerce the IPUMS labelled columns to plain R types used by the rest of the
# script.
ipums_data <- ipums_data %>%
  mutate(
    YEAR = as.integer(YEAR),
    STATEFIP = as.character(STATEFIP),  # Character so it can be used as a key
    AGE = as.integer(AGE),
    # Pin each code to its label explicitly (IPUMS USA codebook: 1 = Male,
    # 2 = Female). Without `levels`, factor() pairs the labels with whatever
    # sorted codes happen to be present, which mislabels the data if one sex
    # is absent and errors if an extra code (e.g. 9 = missing) appears.
    # Any other code becomes NA.
    SEX = factor(
      as.integer(SEX),
      levels = c(1L, 2L),
      labels = c("Male", "Female")
    ),
    URBAN = as.character(URBAN),
    BPL = as.integer(BPL),  # Birthplace code as integer for comparisons
    PERWT = as.numeric(PERWT),  # Person weight used for all weighted counts
    IND1950 = as.character(IND1950)  # Character so it can index label vectors
  )

Specifying Groups

# Birthplace (BPL) code(s) treated as "India-born"
india_birthplace_codes <- 521

# Flag each person: 1 when BPL is one of the India codes, otherwise 0
ipums_data <- mutate(
  ipums_data,
  india_born = if_else(BPL %in% india_birthplace_codes, 1, 0)
)

# Comparison group: everyone whose birthplace code is not 521
general_population_data <- filter(ipums_data, BPL != 521)

Population Pyramids Preparation

For India-born data and overall population
# Build the weighted age-by-sex counts behind a population pyramid.
#
# Args:
#   df: Data frame with AGE, SEX and PERWT columns.
#   group_label: Value stored in the `group` column of every output row.
#
# Returns:
#   One row per AGE/SEX combination with `population` (sum of PERWT) and
#   `group`. Rows with missing AGE or SEX are excluded.
prepare_pyramid_data <- function(df, group_label = "Group") {
  complete_rows <- filter(df, !is.na(AGE), !is.na(SEX))
  weighted_counts <- summarise(
    group_by(complete_rows, AGE, SEX),
    population = sum(PERWT, na.rm = TRUE),
    .groups = "drop"
  )
  mutate(weighted_counts, group = group_label)
}

# Subset of persons flagged as India-born
india_born_data <- filter(ipums_data, india_born == 1)

# Pyramid inputs: India-born only, and the full sample
pyramid_india_born <- prepare_pyramid_data(
  df = india_born_data,
  group_label = "India-born"
)
pyramid_general_pop <- prepare_pyramid_data(
  df = ipums_data,
  group_label = "All"
)

Creating the Pyramids

# Draw a population pyramid from age-by-sex counts.
#
# Args:
#   pyramid_df: Data frame with AGE, SEX and population columns.
#   title_str: Plot title.
#
# Returns:
#   A ggplot object with ages on the vertical axis (after the flip), males
#   extending to the left and females to the right.
plot_pyramid <- function(pyramid_df, title_str) {
  # Negate male counts so the two sexes are drawn on opposite sides of zero
  mirrored <- mutate(
    pyramid_df,
    population = if_else(SEX == "Male", -population, population)
  )
  sex_colours <- c("Male" = "blue", "Female" = "red")

  ggplot(mirrored, aes(x = AGE, y = population, fill = SEX)) +
    geom_bar(stat = "identity") +
    coord_flip() +
    # Show magnitudes on the axis rather than the signed plotting values
    scale_y_continuous(labels = abs) +
    scale_fill_manual(values = sex_colours) +
    labs(
      title = title_str,
      x = "Age",
      y = "Weighted Population",
      fill = "Sex"
    ) +
    theme_minimal()
}

# Build both pyramids, then render them one after the other
pyramid_india_plot <- plot_pyramid(
  pyramid_df = pyramid_india_born,
  title_str = "Population Pyramid: India-born (1900 & 1910)"
)
pyramid_general_plot <- plot_pyramid(
  pyramid_df = pyramid_general_pop,
  title_str = "Population Pyramid: General Population (1900 & 1910)"
)

print(pyramid_india_plot)

print(pyramid_general_plot)

Labor force

# Display labels for LABFORCE, keyed by the code written as a string
labforce_labels <- setNames(
  c(
    "N/A",
    "No, not in the labor force",
    "Yes, in the labor force",
    "Unclassifiable"
  ),
  c("0", "1", "2", "9")
)

# Summarise labor force status by census year as "count (pct%)" strings.
#
# Args:
#   data: Person-level data frame with YEAR, LABFORCE and PERWT columns.
#     Both 1900 and 1910 must be present, because the output columns are
#     referenced by name.
#
# Returns:
#   A tibble with one row per labor force label and the columns LABFORCE,
#   `1900` and `1910`. Each cell holds the weighted count followed by its
#   share of that year's total; "N/A" where a status is absent in a year.
process_labor_force <- function(data) {
  data %>%
    group_by(YEAR, LABFORCE) %>%
    summarise(count = sum(PERWT, na.rm = TRUE), .groups = "drop") %>%
    # Shares must be taken within each census year. Summing over the whole
    # table (as before) divided by the 1900 + 1910 total, so a year's
    # percentages did not add up to 100.
    group_by(YEAR) %>%
    mutate(
      Total = sum(count),  # Weighted total for this year only
      Percentage = (count / Total) * 100
    ) %>%
    ungroup() %>%
    mutate(LABFORCE = labforce_labels[as.character(LABFORCE)]) %>%  # Map labels
    select(YEAR, LABFORCE, count, Percentage) %>%
    pivot_wider(
      names_from = YEAR,
      values_from = c(count, Percentage),
      names_glue = "{YEAR}_{.value}"
    ) %>%
    mutate(
      `1900` = ifelse(!is.na(`1900_count`), paste0(`1900_count`, " (", round(`1900_Percentage`, 2), "%)"), "N/A"),
      `1910` = ifelse(!is.na(`1910_count`), paste0(`1910_count`, " (", round(`1910_Percentage`, 2), "%)"), "N/A")
    ) %>%
    select(LABFORCE, `1900`, `1910`)  # Keep only the display columns
}

# Fallback when this section is run without the earlier setup.
# NOTE(review): this fallback uses the full dataset, whereas the definition
# in "Specifying Groups" excludes BPL 521 - confirm which one is intended.
if (!exists("general_population_data")) general_population_data <- ipums_data

# Labor force table for the India-born, with group-specific column names
labor_force_india <- rename(
  process_labor_force(india_born_data),
  `1900 India-Born` = `1900`,
  `1910 India-Born` = `1910`
)

# Same table for the general population
labor_force_general <- rename(
  process_labor_force(general_population_data),
  `1900 General` = `1900`,
  `1910 General` = `1910`
)

# Side-by-side comparison, keeping every status found in the general table
labor_force_comparison <- left_join(
  labor_force_general,
  labor_force_india,
  by = "LABFORCE"
)

# Render the comparison
kable(
  labor_force_comparison,
  caption = "Comparison of Labor Force Status Between India-Born and General Population (1900 & 1910)",
  digits = 2
)
Comparison of Labor Force Status Between India-Born and General Population (1900 & 1910)
LABFORCE 1900 General 1910 General 1900 India-Born 1910 India-Born
N/A 27603280 (16.39%) 31365865 (18.62%) 501 (7.04%) 502 (7.05%)
No, not in the labor force 21026481 (12.49%) 24769361 (14.71%) 901 (12.66%) 1103 (15.5%)
Yes, in the labor force 27437775 (16.29%) 36210178 (21.5%) 602 (8.46%) 3509 (49.3%)

Occupations

# Display labels for the OCC codes reported in the occupation tables,
# keyed by the code written as a string. Codes not listed here map to NA.
occupation_labels <- setNames(
  c(
    "N/A or None Reported",
    "Farmers (owners and tenants)",
    "Carpenters",
    "Bookkeepers and accounting clerks",
    "Truck, delivery, and tractor drivers",
    "Laborers, except construction",
    "Not specified service workers",
    "Clerical and kindred workers",
    "Machinists",
    "Electricians",
    "Operatives and kindred workers"
  ),
  c("999", "12", "46", "304", "91", "988", "994", "196", "62", "74", "980")
)

# Restrict the sample to persons aged 10 and over
filtered_data <- filter(ipums_data, AGE >= 10)

# India-born (BPL 521): weighted count per OCC code. Percentages are shares
# of all India-born aged 10+, computed before unlabelled codes are dropped.
occupation_group <- filtered_data %>%
  filter(BPL == 521) %>%
  count(OCC, wt = PERWT) %>%
  mutate(Occupation = occupation_labels[as.character(OCC)], .after = OCC) %>%
  mutate(Percentage = n / sum(n) * 100) %>%
  drop_na(Occupation) %>%  # Keep only the occupations that have a label
  arrange(desc(Percentage)) %>%
  slice_head(n = 10)

# Same computation over everyone aged 10+
occupation_population <- filtered_data %>%
  count(OCC, wt = PERWT) %>%
  mutate(Occupation = occupation_labels[as.character(OCC)], .after = OCC) %>%
  mutate(Percentage = n / sum(n) * 100) %>%
  drop_na(Occupation) %>%  # Keep only the occupations that have a label
  arrange(desc(Percentage)) %>%
  slice_head(n = 10)

# Render the India-born table
kable(
  occupation_group,
  caption = "Top 10 Occupations of India-Born Individuals (Age 10+)",
  digits = 2
)
Top 10 Occupations of India-Born Individuals (Age 10+)
OCC Occupation n Percentage
999 N/A or None Reported 2005 29.85
12 Farmers (owners and tenants) 603 8.98
46 Carpenters 502 7.47
304 Bookkeepers and accounting clerks 500 7.44
91 Truck, delivery, and tractor drivers 201 2.99
988 Laborers, except construction 201 2.99
994 Not specified service workers 201 2.99
196 Clerical and kindred workers 200 2.98
62 Machinists 101 1.50
74 Electricians 101 1.50
# Render the general-population occupation table
kable(
  occupation_population,
  caption = "Top 10 Occupations of the General Population (Age 10+)",
  digits = 2
)
Top 10 Occupations of the General Population (Age 10+)
OCC Occupation n Percentage
999 N/A or None Reported 54506584 41.95
304 Bookkeepers and accounting clerks 6892037 5.30
12 Farmers (owners and tenants) 2207438 1.70
196 Clerical and kindred workers 1442506 1.11
994 Not specified service workers 685775 0.53
91 Truck, delivery, and tractor drivers 513244 0.40
988 Laborers, except construction 459118 0.35
46 Carpenters 208899 0.16
62 Machinists 205815 0.16
980 Operatives and kindred workers 97883 0.08

Industries

# Display labels for the IND1950 codes reported in the industry tables,
# keyed by the code written as a string. Codes not listed here map to NA.
industry_labels <- setNames(
  c(
    "N/A or None Reported",
    "Agriculture",
    "Private Households",
    "Construction",
    "Railroads and Railway Express Service",
    "Nonclassifiable",
    "Food Stores, Except Dairy Products",
    "Educational Services",
    "Yarn, Thread, and Fabric Mills",
    "Coal Mining"
  ),
  c("0", "105", "826", "246", "506", "997", "636", "888", "439", "216")
)

# Restrict the sample to persons aged 10 and over
filtered_data <- filter(ipums_data, AGE >= 10)

# India-born (BPL 521): weighted count per IND1950 code. Percentages are
# shares of all India-born aged 10+, computed before unlabelled codes are
# dropped.
industry_group <- filtered_data %>%
  filter(BPL == 521) %>%
  count(IND1950, wt = PERWT) %>%
  mutate(Industry = industry_labels[as.character(IND1950)], .after = IND1950) %>%
  mutate(Percentage = n / sum(n) * 100) %>%
  drop_na(Industry) %>%  # Keep only the industries that have a label
  arrange(desc(Percentage)) %>%
  slice_head(n = 10)

# Same computation over everyone aged 10+
industry_population <- filtered_data %>%
  count(IND1950, wt = PERWT) %>%
  mutate(Industry = industry_labels[as.character(IND1950)], .after = IND1950) %>%
  mutate(Percentage = n / sum(n) * 100) %>%
  drop_na(Industry) %>%  # Keep only the industries that have a label
  arrange(desc(Percentage)) %>%
  slice_head(n = 10)

# Render the India-born table
kable(
  industry_group,
  caption = "Top 10 Industries of India-Born Individuals (Age 10+)",
  digits = 2
)
Top 10 Industries of India-Born Individuals (Age 10+)
IND1950 Industry n Percentage
0 N/A or None Reported 2805 41.77
105 Agriculture 1205 17.94
246 Construction 101 1.50
826 Private Households 100 1.49
888 Educational Services 100 1.49
# Render the general-population industry table
kable(
  industry_population,
  caption = "Top 10 Industries of the General Population (Age 10+)",
  digits = 2
)
Top 10 Industries of the General Population (Age 10+)
IND1950 Industry n Percentage
0 N/A or None Reported 64287504 49.48
105 Agriculture 22809334 17.55
826 Private Households 3872636 2.98
246 Construction 3304884 2.54
506 Railroads and Railway Express Service 2435376 1.87
997 Nonclassifiable 1849958 1.42
636 Food Stores, Except Dairy Products 1434992 1.10
888 Educational Services 1319729 1.02
439 Yarn, Thread, and Fabric Mills 1081252 0.83
216 Coal Mining 979816 0.75

Urban/Rural Comparison

# Display labels for URBAN, keyed by the code written as a string
# (adjust if the extract uses different codes)
urban_labels <- setNames(
  c("N/A", "Rural", "Urban"),
  c("0", "1", "2")
)

# Summarise urban/rural status by census year as "count (pct%)" strings.
#
# Args:
#   data: Person-level data frame with YEAR, URBAN and PERWT columns.
#     Both 1900 and 1910 must be present, because the output columns are
#     referenced by name.
#
# Returns:
#   A tibble with one row per urban/rural label and the columns URBAN,
#   `1900` and `1910`. Each cell holds the weighted count followed by its
#   share of that year's total; "N/A" where a status is absent in a year.
process_urban_rural <- function(data) {
  data %>%
    group_by(YEAR, URBAN) %>%
    summarise(count = sum(PERWT, na.rm = TRUE), .groups = "drop") %>%
    # Shares must be taken within each census year. Summing over the whole
    # table (as before) divided by the 1900 + 1910 total, so a year's
    # percentages did not add up to 100.
    group_by(YEAR) %>%
    mutate(
      Total = sum(count),  # Weighted total for this year only
      Percentage = (count / Total) * 100
    ) %>%
    ungroup() %>%
    mutate(URBAN = urban_labels[as.character(URBAN)]) %>%  # Map labels
    select(YEAR, URBAN, count, Percentage) %>%
    pivot_wider(
      names_from = YEAR,
      values_from = c(count, Percentage),
      names_glue = "{YEAR}_{.value}"
    ) %>%
    mutate(
      `1900` = ifelse(!is.na(`1900_count`), paste0(`1900_count`, " (", round(`1900_Percentage`, 2), "%)"), "N/A"),
      `1910` = ifelse(!is.na(`1910_count`), paste0(`1910_count`, " (", round(`1910_Percentage`, 2), "%)"), "N/A")
    ) %>%
    select(URBAN, `1900`, `1910`)  # Keep only the display columns
}

# Fallback when this section is run without the earlier setup.
# NOTE(review): this fallback uses the full dataset, whereas the definition
# in "Specifying Groups" excludes BPL 521 - confirm which one is intended.
if (!exists("general_population_data")) general_population_data <- ipums_data

# Urban/rural table for the India-born, with group-specific column names
urban_rural_india <- rename(
  process_urban_rural(india_born_data),
  `1900 India-Born` = `1900`,
  `1910 India-Born` = `1910`
)

# Same table for the general population
urban_rural_general <- rename(
  process_urban_rural(general_population_data),
  `1900 General` = `1900`,
  `1910 General` = `1910`
)

# Side-by-side comparison, keeping every status found in the general table
urban_rural_comparison <- left_join(
  urban_rural_general,
  urban_rural_india,
  by = "URBAN"
)

# Render the comparison
kable(
  urban_rural_comparison,
  caption = "Comparison of Urban/Rural Distribution Between India-Born and General Population (1900 & 1910)",
  digits = 2
)
Comparison of Urban/Rural Distribution Between India-Born and General Population (1900 & 1910)
URBAN 1900 General 1910 General 1900 India-Born 1910 India-Born
Rural 46638181 (27.69%) 50952346 (30.25%) 400 (5.62%) 2006 (28.18%)
Urban 29429355 (17.47%) 41393058 (24.58%) 1604 (22.53%) 3108 (43.66%)

Nativity

# Display labels for NATIVITY, keyed by the code written as a string
nativity_labels <- setNames(
  c(
    "N/A or Unknown",
    "Both Parents Native-Born",
    "Father Foreign, Mother Native",
    "Mother Foreign, Father Native",
    "Both Parents Foreign",
    "Foreign-Born"
  ),
  c("0", "1", "2", "3", "4", "5")
)

# Summarise nativity by census year as "count (pct%)" strings.
#
# Args:
#   data: Person-level data frame with YEAR, NATIVITY and PERWT columns.
#     Both 1900 and 1910 must be present, because the output columns are
#     referenced by name.
#
# Returns:
#   A tibble with one row per nativity label and the columns NATIVITY,
#   `1900` and `1910`. Each cell holds the weighted count followed by its
#   share of that year's total; "N/A" where a category is absent in a year.
process_nativity <- function(data) {
  data %>%
    group_by(YEAR, NATIVITY) %>%
    summarise(count = sum(PERWT, na.rm = TRUE), .groups = "drop") %>%
    # Shares must be taken within each census year. Summing over the whole
    # table (as before) divided by the 1900 + 1910 total, so a year's
    # percentages did not add up to 100.
    group_by(YEAR) %>%
    mutate(
      Total = sum(count),  # Weighted total for this year only
      Percentage = (count / Total) * 100
    ) %>%
    ungroup() %>%
    mutate(NATIVITY = nativity_labels[as.character(NATIVITY)]) %>%  # Map labels
    select(YEAR, NATIVITY, count, Percentage) %>%
    pivot_wider(
      names_from = YEAR,
      values_from = c(count, Percentage),
      names_glue = "{YEAR}_{.value}"
    ) %>%
    mutate(
      `1900` = ifelse(!is.na(`1900_count`), paste0(`1900_count`, " (", round(`1900_Percentage`, 2), "%)"), "N/A"),
      `1910` = ifelse(!is.na(`1910_count`), paste0(`1910_count`, " (", round(`1910_Percentage`, 2), "%)"), "N/A")
    ) %>%
    select(NATIVITY, `1900`, `1910`)  # Keep only the display columns
}

# Fallback when this section is run without the earlier setup.
# NOTE(review): this fallback uses the full dataset, whereas the definition
# in "Specifying Groups" excludes BPL 521 - confirm which one is intended.
if (!exists("general_population_data")) {
  general_population_data <- ipums_data
}

# Nativity table for the India-born, with group-specific column names
nativity_india <- india_born_data %>%
  process_nativity() %>%
  rename(`1900 India-Born` = `1900`, `1910 India-Born` = `1910`)

# Same table for the general population. The 1900 column name previously
# contained two spaces ("1900  General"), unlike every other comparison
# table; it now uses a single space so the headers are consistent.
nativity_general <- general_population_data %>%
  process_nativity() %>%
  rename(`1900 General` = `1900`, `1910 General` = `1910`)

# Side-by-side comparison, keeping every category found in the general table
nativity_comparison <- nativity_general %>%
  left_join(nativity_india, by = "NATIVITY")

# Render the comparison
kable(nativity_comparison, digits = 2, caption = "Comparison of Nativity Between India-Born and General Population (1900 & 1910)")
Comparison of Nativity Between India-Born and General Population (1900 & 1910)
NATIVITY 1900 General 1910 General 1900 India-Born 1910 India-Born
Both Parents Native-Born 49808238 (29.58%) 59242530 (35.18%) NA NA
Father Foreign, Mother Native 3496849 (2.08%) 4234224 (2.51%) NA NA
Mother Foreign, Father Native 1673298 (0.99%) 2227348 (1.32%) NA NA
Both Parents Foreign 10605032 (6.3%) 12971464 (7.7%) NA NA
Foreign-Born 10484119 (6.23%) 13669838 (8.12%) 2004 (28.15%) 5114 (71.85%)

Geographic Distribution (For Mapping)

# Weighted state-level totals for the whole sample and for the India-born.
geo_distribution_data <- ipums_data %>%
  group_by(STATEFIP) %>%
  summarise(
    total_state_population = sum(PERWT, na.rm = TRUE),  # Everyone in the state
    india_population = sum(if_else(india_born == 1, PERWT, 0), na.rm = TRUE),  # India-born only
    .groups = "drop"
  ) %>%
  # The shares are computed after the grouping is dropped. Inside the grouped
  # summarise(), sum(india_population) saw only the current state's value,
  # so india_prop_of_group came out as 1 (or NaN for 0 / 0) for every state.
  mutate(
    india_prop_of_group = india_population / sum(india_population),  # State's share of all India-born
    india_prop_in_state = india_population / total_state_population  # India-born share of the state's population
  )

# Translate FIPS codes to full state names. state.abb holds postal
# abbreviations, so matching FIPS codes against it always returned NA.
# The 50 state FIPS codes run in the same alphabetical order as state.name;
# 3, 7, 14, 43 and 52 are unassigned and 11 is the District of Columbia.
state_fips_codes <- setdiff(1:56, c(3L, 7L, 11L, 14L, 43L, 52L))
state_name_by_fips <- c(
  setNames(state.name, state_fips_codes),
  "11" = "District of Columbia"
)
geo_distribution_data <- geo_distribution_data %>%
  mutate(
    # as.integer() strips any zero padding so "06" and "6" both resolve;
    # codes outside the lookup yield NA.
    State = unname(state_name_by_fips[as.character(as.integer(STATEFIP))])
  )

Mapping

# Add a numeric `fips` column derived from STATEFIP for the maps below
geo_distribution_data <- mutate(
  geo_distribution_data,
  fips = as.numeric(STATEFIP)
)

# Three state-level choropleths built from geo_distribution_data. The scale
# formatter is passed as `labels =`, the argument's full name; the original
# `label =` only worked through partial argument matching.

# Weighted number of India-born residents per state
map_india_population <- plot_usmap(data = geo_distribution_data, values = "india_population", regions = "states") +
  scale_fill_continuous(name = "India-born Pop.", low = "lightblue", high = "darkblue", labels = scales::comma) +
  labs(title = "Number of India-born Individuals by State (1900 & 1910)",
       subtitle = "Source: IPUMS-USA",
       caption = "Darker states indicate a higher number of India-born residents") +
  theme_minimal()

# Each state's share of the total India-born population
map_india_share <- plot_usmap(data = geo_distribution_data, values = "india_prop_of_group", regions = "states") +
  scale_fill_continuous(name = "Share of India-born (%)", low = "lightgreen", high = "darkgreen", labels = scales::percent) +
  labs(title = "Proportion of India-born Individuals in Each State (1900 & 1910)",
       subtitle = "Source: IPUMS-USA",
       caption = "Darker states indicate a higher proportion of the total India-born population") +
  theme_minimal()

# India-born residents as a share of each state's own population
map_india_relative <- plot_usmap(data = geo_distribution_data, values = "india_prop_in_state", regions = "states") +
  scale_fill_continuous(name = "India-born as % of State Pop.", low = "lightpink", high = "darkred", labels = scales::percent) +
  labs(title = "Proportion of State Population That is India-born (1900 & 1910)",
       subtitle = "Source: IPUMS-USA",
       caption = "Darker states indicate a higher proportion of India-born residents in the state population") +
  theme_minimal()

# Render the three maps in order
print(map_india_population)

print(map_india_share)

print(map_india_relative)