Assignment 3 Final

Loading Packages

library(tidyverse)
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.5.1     ✔ tibble    3.2.1
✔ lubridate 1.9.4     ✔ tidyr     1.3.1
✔ purrr     1.0.2     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ipumsr)
library(ggplot2)
library(usmap)
library(ggthemes)
library(ggpubr)
library(kableExtra)

Attaching package: 'kableExtra'

The following object is masked from 'package:dplyr':

    group_rows
library(tidyr)

Setting up the Workspace

# Point the session at the assignment folder so that relative file names used
# later (e.g. "usa_00004.xml") resolve against it.
# NOTE(review): this path is specific to one machine's directory layout; the
# call will fail wherever that folder does not exist.
setwd("~/Desktop/econ code/assignment 3/")

Loading in IPUMS Data

# Read the DDI codebook that read_ipums_micro() uses to parse the extract.
ddi <- read_ipums_ddi("usa_00004.xml")

# Load the microdata. The data file is referenced relative to the working
# directory, the same way the DDI file is, instead of through an absolute
# path tied to one user's home directory.
ipums_data <- read_ipums_micro(
  ddi,
  data_file = "usa_00004.dat.gz"
)
Use of data from IPUMS USA is subject to conditions including that users should cite the data appropriately. Use command `ipums_conditions()` for more details.

Data Cleaning and Preparation

# Coerce the IPUMS columns used in this analysis to plain R types.
ipums_data <- ipums_data %>%
  mutate(
    YEAR = as.integer(YEAR),
    STATEFIP = as.character(STATEFIP),  # Character FIPS code for grouping/mapping
    AGE = as.integer(AGE),
    # Pin the code-to-label mapping explicitly (1 = Male, 2 = Female) so the
    # labels cannot shift or error if a code is absent or an extra code
    # appears in the data; any other code becomes NA.
    SEX = factor(SEX, levels = c(1, 2), labels = c("Male", "Female")),
    URBAN = as.character(URBAN),
    BPL = as.integer(BPL),  # Birthplace code as an integer
    PERWT = as.numeric(PERWT)  # Person weight as a numeric
  )

Specifying Social Group

# Birthplace codes that define the India-born group.
india_birthplace_codes <- c(521)

# Add a 1/0 indicator: 1 when the respondent's BPL is one of the codes above.
ipums_data <- mutate(
  ipums_data,
  india_born = as.numeric(BPL %in% india_birthplace_codes)
)

Population Pyramids Preparation

For the India-born data and the overall population
#' Summarise weighted population by age and sex for a population pyramid.
#'
#' @param df Data frame with `AGE`, `SEX`, and `PERWT` columns.
#' @param group_label Label stored in the `group` column of the result.
#' @return One row per AGE/SEX combination with the summed `PERWT` in
#'   `population` and `group_label` in `group`.
prepare_pyramid_data <- function(df, group_label = "Group") {
  # Rows missing either age or sex cannot be placed on the pyramid.
  usable_rows <- filter(df, !is.na(AGE), !is.na(SEX))

  cell_totals <- summarise(
    group_by(usable_rows, AGE, SEX),
    population = sum(PERWT, na.rm = TRUE),
    .groups = "drop"
  )

  mutate(cell_totals, group = group_label)
}

# Subset of respondents flagged as India-born; reused by the tables below.
india_born_data <- filter(ipums_data, india_born == 1)

# Age/sex totals for the India-born group and for the full sample.
pyramid_india_born <- prepare_pyramid_data(
  df = india_born_data,
  group_label = "India-born"
)
pyramid_general_pop <- prepare_pyramid_data(
  df = ipums_data,
  group_label = "All"
)

Creating the Pyramids

#' Draw a population pyramid from age/sex totals.
#'
#' @param pyramid_df Data frame with `AGE`, `SEX`, and `population` columns.
#' @param title_str Plot title.
#' @return A ggplot object with male bars extending left and female bars
#'   extending right.
plot_pyramid <- function(pyramid_df, title_str) {
  # Negate male counts so the two sexes are drawn on opposite sides of zero.
  mirrored <- mutate(
    pyramid_df,
    population = if_else(SEX == "Male", -population, population)
  )

  sex_colours <- c("Male" = "blue", "Female" = "red")

  ggplot(mirrored, aes(x = AGE, y = population, fill = SEX)) +
    geom_bar(stat = "identity") +
    coord_flip() +
    # Show magnitudes on the axis instead of the negated male values.
    scale_y_continuous(labels = abs) +
    scale_fill_manual(values = sex_colours) +
    labs(
      title = title_str,
      x = "Age",
      y = "Weighted Population",
      fill = "Sex"
    ) +
    theme_minimal()
}

# Build one pyramid for the India-born group and one for the full sample.
pyramid_india_plot <- plot_pyramid(
  pyramid_df = pyramid_india_born,
  title_str = "Population Pyramid: India-born (1900 & 1910)"
)
pyramid_general_plot <- plot_pyramid(
  pyramid_df = pyramid_general_pop,
  title_str = "Population Pyramid: General Population (1900 & 1910)"
)

# Render both plots.
print(pyramid_india_plot)

print(pyramid_general_plot)

Labor force

# Weighted count of India-born respondents for each year and LABFORCE code.
labor_force_status <- summarise(
  group_by(india_born_data, YEAR, LABFORCE),
  count = sum(PERWT, na.rm = TRUE),
  .groups = "drop"
)

# Render the summary as a styled table.
kable_styling(
  kable(
    labor_force_status,
    caption = "Labor Force Distribution of India-Born Individuals"
  )
)
Labor Force Distribution of India-Born Individuals
YEAR LABFORCE count
1900 0 501
1900 1 901
1900 2 602
1910 0 502
1910 1 1103
1910 2 3509

Occupations

# Weighted count of India-born respondents per occupation code, keeping the
# ten largest. slice_max() returns rows in descending order of `n`, so no
# separate sort is needed.
# with_ties = FALSE caps the result at exactly ten rows; by default ties at
# the cut-off are all kept, which lets a "Top 10" table grow past ten rows.
# Ties at the cut-off are resolved by row order (ascending OCC code).
top_10_occupations <- india_born_data %>%
  count(OCC, wt = PERWT) %>%
  slice_max(order_by = n, n = 10, with_ties = FALSE)

kable(top_10_occupations, caption = "Top 10 Occupations of India-Born Individuals") %>%
  kable_styling()
Top 10 Occupations of India-Born Individuals
OCC n
999 2407
12 603
46 502
304 500
91 201
988 201
994 201
196 200
62 101
74 101
980 101

Urban/Rural Comparison

# Weighted count of India-born respondents for each year and URBAN code.
urban_vs_rural <- summarise(
  group_by(india_born_data, YEAR, URBAN),
  count = sum(PERWT, na.rm = TRUE),
  .groups = "drop"
)

# Render the summary as a styled table.
kable_styling(
  kable(
    urban_vs_rural,
    caption = "Urban/Rural Distribution of India-Born Individuals"
  )
)
Urban/Rural Distribution of India-Born Individuals
YEAR URBAN count
1900 1 400
1900 2 1604
1910 1 2006
1910 2 3108

Geographic Distribution (For Mapping)

# Weighted totals per state: everyone, and India-born respondents only.
geo_distribution_data <- ipums_data %>%
  group_by(STATEFIP) %>%
  summarise(
    total_state_population = sum(PERWT, na.rm = TRUE),  # Total weighted population in the state
    india_population = sum(if_else(india_born == 1, PERWT, 0), na.rm = TRUE),  # India-born weighted population
    .groups = "drop"
  ) %>%
  # The shares are computed on the ungrouped result. Inside the grouped
  # summarise(), sum(india_population) would only see the current state's
  # value, making every state's share of the group equal to 1 (or NaN).
  mutate(
    india_prop_of_group = india_population / sum(india_population),  # Share of total India-born in each state
    india_prop_in_state = india_population / total_state_population  # % of state's population that is India-born
  )

# Convert `STATEFIP` to full state names for clarity.
# STATEFIP holds numeric FIPS codes, not postal abbreviations, so it cannot be
# matched against `state.abb`. State FIPS codes follow the alphabetical order
# of `state.name` over 1-56, skipping 3, 7, 14, 43 and 52; 11 is the District
# of Columbia, which `state.name` does not include.
state_fips_codes <- setdiff(1:56, c(3, 7, 11, 14, 43, 52))
state_names_by_fips <- c(
  setNames(state.name, state_fips_codes),
  "11" = "District of Columbia"
)

# as.integer() normalises both "6" and "06" to the same lookup key; codes
# that are not a state or DC yield NA.
geo_distribution_data <- geo_distribution_data %>%
  mutate(
    State = unname(state_names_by_fips[as.character(as.integer(STATEFIP))])
  )

Mapping

# Add the numeric `fips` column that the maps below read state codes from.
geo_distribution_data$fips <- as.numeric(geo_distribution_data$STATEFIP)

#' Build one state-level choropleth of an India-born measure.
#'
#' @param map_data Data frame with a `fips` column and the column in `value_col`.
#' @param value_col Name of the column used for the fill.
#' @param legend_name Legend title.
#' @param low,high Colours for the low and high ends of the fill gradient.
#' @param label_fn Function used to format the legend labels.
#' @param title,caption Plot title and caption.
#' @return A ggplot object.
make_india_map <- function(map_data, value_col, legend_name, low, high,
                           label_fn, title, caption) {
  plot_usmap(data = map_data, values = value_col, regions = "states") +
    # `labels` is spelled out in full rather than relying on partial
    # argument matching of `label`.
    scale_fill_continuous(
      name = legend_name,
      low = low,
      high = high,
      labels = label_fn
    ) +
    labs(title = title, subtitle = "Source: IPUMS-USA", caption = caption) +
    theme_minimal()
}

# Weighted number of India-born residents per state.
map_india_population <- make_india_map(
  geo_distribution_data,
  value_col = "india_population",
  legend_name = "India-born Pop.",
  low = "lightblue",
  high = "darkblue",
  label_fn = scales::comma,
  title = "Number of India-born Individuals by State (1900 & 1910)",
  caption = "Darker states indicate a higher number of India-born residents"
)

# Each state's share of the total India-born population.
map_india_share <- make_india_map(
  geo_distribution_data,
  value_col = "india_prop_of_group",
  legend_name = "Share of India-born (%)",
  low = "lightgreen",
  high = "darkgreen",
  label_fn = scales::percent,
  title = "Proportion of India-born Individuals in Each State (1900 & 1910)",
  caption = "Darker states indicate a higher proportion of the total India-born population"
)

# India-born residents as a share of each state's own population.
map_india_relative <- make_india_map(
  geo_distribution_data,
  value_col = "india_prop_in_state",
  legend_name = "India-born as % of State Pop.",
  low = "lightpink",
  high = "darkred",
  label_fn = scales::percent,
  title = "Proportion of State Population That is India-born (1900 & 1910)",
  caption = "Darker states indicate a higher proportion of India-born residents in the state population"
)

# Render the three maps in order: counts, share of group, share of state.
print(map_india_population)

print(map_india_share)

print(map_india_relative)