remove()  #Remove objects from the workspace
 rm(list=ls())  #removes all objects from the current workspace (R memory)

Cohort analysis

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(readr)
data_usa<-read_csv("C:\\Users\\anami\\OneDrive\\Documents\\DEM\\Assignment3\\USA.csv")
## Rows: 3859 Columns: 5
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): Area, Sex, Age
## dbl (2): Year, Population
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
data_usa<-rename_with(data_usa, tolower) 
str(data_usa)
## spc_tbl_ [3,859 × 5] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ year      : num [1:3859] 2010 2010 2010 2010 2010 2010 2010 2010 2010 2010 ...
##  $ area      : chr [1:3859] "Total" "Total" "Total" "Total" ...
##  $ sex       : chr [1:3859] "Both Sexes" "Both Sexes" "Both Sexes" "Both Sexes" ...
##  $ age       : chr [1:3859] "Total" "0" "0 - 4" "1" ...
##  $ population: num [1:3859] 3.09e+08 3.94e+06 2.02e+07 3.98e+06 4.10e+06 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   Year = col_double(),
##   ..   Area = col_character(),
##   ..   Sex = col_character(),
##   ..   Age = col_character(),
##   ..   Population = col_double()
##   .. )
##  - attr(*, "problems")=<externalptr>
data_usa1 <- subset(data_usa,  area == "Total" & sex=="Both Sexes")
str(data_usa1)
## tibble [506 × 5] (S3: tbl_df/tbl/data.frame)
##  $ year      : num [1:506] 2010 2010 2010 2010 2010 2010 2010 2010 2010 2010 ...
##  $ area      : chr [1:506] "Total" "Total" "Total" "Total" ...
##  $ sex       : chr [1:506] "Both Sexes" "Both Sexes" "Both Sexes" "Both Sexes" ...
##  $ age       : chr [1:506] "Total" "0" "0 - 4" "1" ...
##  $ population: num [1:506] 3.09e+08 3.94e+06 2.02e+07 3.98e+06 4.10e+06 ...
data_usa2 <- data_usa1 %>%
  mutate(
    age = as.numeric(age))
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `age = as.numeric(age)`.
## Caused by warning:
## ! NAs introduced by coercion
names(data_usa2)
## [1] "year"       "area"       "sex"        "age"        "population"
str(data_usa2)
## tibble [506 × 5] (S3: tbl_df/tbl/data.frame)
##  $ year      : num [1:506] 2010 2010 2010 2010 2010 2010 2010 2010 2010 2010 ...
##  $ area      : chr [1:506] "Total" "Total" "Total" "Total" ...
##  $ sex       : chr [1:506] "Both Sexes" "Both Sexes" "Both Sexes" "Both Sexes" ...
##  $ age       : num [1:506] NA 0 NA 1 2 3 4 5 NA 6 ...
##  $ population: num [1:506] 3.09e+08 3.94e+06 2.02e+07 3.98e+06 4.10e+06 ...
data_usa2 %>%
  filter(age >= 0 & age <= 100)
## # A tibble: 375 × 5
##     year area  sex          age population
##    <dbl> <chr> <chr>      <dbl>      <dbl>
##  1  2010 Total Both Sexes     0    3944153
##  2  2010 Total Both Sexes     1    3978070
##  3  2010 Total Both Sexes     2    4096929
##  4  2010 Total Both Sexes     3    4119040
##  5  2010 Total Both Sexes     4    4063170
##  6  2010 Total Both Sexes     5    4056858
##  7  2010 Total Both Sexes     6    4066381
##  8  2010 Total Both Sexes     7    4030579
##  9  2010 Total Both Sexes     8    4046486
## 10  2010 Total Both Sexes     9    4148353
## # ℹ 365 more rows
str(data_usa2)
## tibble [506 × 5] (S3: tbl_df/tbl/data.frame)
##  $ year      : num [1:506] 2010 2010 2010 2010 2010 2010 2010 2010 2010 2010 ...
##  $ area      : chr [1:506] "Total" "Total" "Total" "Total" ...
##  $ sex       : chr [1:506] "Both Sexes" "Both Sexes" "Both Sexes" "Both Sexes" ...
##  $ age       : num [1:506] NA 0 NA 1 2 3 4 5 NA 6 ...
##  $ population: num [1:506] 3.09e+08 3.94e+06 2.02e+07 3.98e+06 4.10e+06 ...
library(tidyr)
library(dplyr)
data_usa2 <- data_usa2 %>% drop_na()

Graph

# Load required libraries
library(ggplot2)
library(dplyr)
library(scales)
## 
## Attaching package: 'scales'
## The following object is masked from 'package:readr':
## 
##     col_factor
# Step 1: Calculate birth cohort
data_usa2<- data_usa2 %>%
  mutate(cohort = year - age)

# Step 2: Plot cohort analysis with smoother lines, inverse x-axis order, and formatted population
ggplot(data_usa2, aes(x = cohort, y = population, color = factor(year), group = factor(year))) +
  geom_line(size = 1) +
  geom_point(size = 1)+  # Smooth lines using loess
  scale_x_reverse(breaks = seq(1900, 2010, by = 10), labels = paste0(seq(1900, 2010, by = 10), "-", seq(1895, 2005, by = 10))) +  # Inverse x-axis order
  scale_y_continuous(labels = comma) +  # Format population with commas
  scale_color_manual(values = c("blue", "cyan", "yellow", "purple")) +  # Colors for different census years
  labs(title = "Cohort Analysis, United States, 1980-2010",
       x = "Birth Cohort",
       y = "Population",
       color = "Census Year") +
  theme_minimal() +
  theme(legend.position = "bottom", 
        axis.text.x = element_text(angle = 45, hjust = 1))
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

Distribution of age

library(dplyr)
library(readr)
data1<-read_csv("C:\\Users\\anami\\OneDrive\\Documents\\DEM\\Assignment3\\Assignment 3.csv")
## Rows: 3858 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): Country or Area, Area, Sex, Age
## dbl (2): Year, Value
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
data1<-rename_with(data1, tolower) 
str(data1)
## spc_tbl_ [3,858 × 6] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ country or area: chr [1:3858] "United States of America" "United States of America" "United States of America" "United States of America" ...
##  $ year           : num [1:3858] 2010 2010 2010 2010 2010 2010 2010 2010 2010 2010 ...
##  $ area           : chr [1:3858] "Total" "Total" "Total" "Total" ...
##  $ sex            : chr [1:3858] "Both Sexes" "Both Sexes" "Both Sexes" "Both Sexes" ...
##  $ age            : chr [1:3858] "Total" "0" "0 - 4" "1" ...
##  $ value          : num [1:3858] 3.09e+08 3.94e+06 2.02e+07 3.98e+06 4.10e+06 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   `Country or Area` = col_character(),
##   ..   Year = col_double(),
##   ..   Area = col_character(),
##   ..   Sex = col_character(),
##   ..   Age = col_character(),
##   ..   Value = col_double()
##   .. )
##  - attr(*, "problems")=<externalptr>
data2 <- subset(data1, year == 2010 & area == "Total" & sex %in% c("Male", "Female"))
str(data2)
## tibble [244 × 6] (S3: tbl_df/tbl/data.frame)
##  $ country or area: chr [1:244] "United States of America" "United States of America" "United States of America" "United States of America" ...
##  $ year           : num [1:244] 2010 2010 2010 2010 2010 2010 2010 2010 2010 2010 ...
##  $ area           : chr [1:244] "Total" "Total" "Total" "Total" ...
##  $ sex            : chr [1:244] "Male" "Male" "Male" "Male" ...
##  $ age            : chr [1:244] "Total" "0" "0 - 4" "1" ...
##  $ value          : num [1:244] 1.52e+08 2.01e+06 1.03e+07 2.03e+06 2.09e+06 ...
data3 <- data2 %>%
  mutate(
    age = as.numeric(age))
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `age = as.numeric(age)`.
## Caused by warning:
## ! NAs introduced by coercion
names(data3)
## [1] "country or area" "year"            "area"            "sex"            
## [5] "age"             "value"
# Load the necessary packages
library(dplyr)
library(tidyr)

# Use drop_na to remove rows with NA values
data4 <- data3 %>% drop_na()
# Assuming you want to assign the filtered data to a new variable data4
data4f <- data4 %>%
  filter(age >= 1 & age <= 89)

# View the structure of the filtered data
str(data4f)
## tibble [178 × 6] (S3: tbl_df/tbl/data.frame)
##  $ country or area: chr [1:178] "United States of America" "United States of America" "United States of America" "United States of America" ...
##  $ year           : num [1:178] 2010 2010 2010 2010 2010 2010 2010 2010 2010 2010 ...
##  $ area           : chr [1:178] "Total" "Total" "Total" "Total" ...
##  $ sex            : chr [1:178] "Male" "Male" "Male" "Male" ...
##  $ age            : num [1:178] 1 2 3 4 5 6 7 8 9 10 ...
##  $ value          : num [1:178] 2030853 2092198 2104550 2077550 2072094 ...

Median age

combined_data <- data4f %>%
  group_by(age) %>%
  summarise(total_population = sum(value))
mean_age <- weighted.mean(combined_data$age, combined_data$total_population)
cat("Mean Age:", mean_age, "\n")
## Mean Age: 37.4545
# Calculate weighted median age
# Function for weighted median
weighted_median <- function(x, w) {
  # Sort by x
  sorted_data <- data.frame(x = x, w = w)[order(x),]
  cumulative_weight <- cumsum(sorted_data$w)
  total_weight <- sum(sorted_data$w)
  median_index <- which(cumulative_weight >= total_weight / 2)[1]
  return(sorted_data$x[median_index])
}

median_age <- weighted_median(combined_data$age, combined_data$total_population)
cat("Median Age:", median_age, "\n")
## Median Age: 37

Population Pyramid 1

library(ggplot2)
library(scales)  # For number formatting

# Convert male population to negative values and keep female population positive
data4g <- data4f %>%
  mutate(value = ifelse(sex == "Male", -value, value))

# Create the population pyramid
ggplot(data4g, aes(x = age, y = value, fill = sex)) +
  geom_bar(stat = "identity") +
  coord_flip() +  # Flip coordinates for pyramid style
  scale_y_continuous(labels = function(x) comma(abs(x)), name = "Population") +  # Display positive values on the y-axis
  labs(x = "Age", title ="Population distribution by Gender and Single-Year Age Group,United States,2010") +
  scale_fill_manual(values = c("Male" = "blue", "Female" = "red")) +
  theme_minimal()

### 1980 census

data1980 <- subset(data1, year == 1980 & area == "Rural" & sex %in% c("Male", "Female"))
str(data1980)
## tibble [210 × 6] (S3: tbl_df/tbl/data.frame)
##  $ country or area: chr [1:210] "United States of America" "United States of America" "United States of America" "United States of America" ...
##  $ year           : num [1:210] 1980 1980 1980 1980 1980 1980 1980 1980 1980 1980 ...
##  $ area           : chr [1:210] "Rural" "Rural" "Rural" "Rural" ...
##  $ sex            : chr [1:210] "Male" "Male" "Male" "Male" ...
##  $ age            : chr [1:210] "Total" "0" "1" "45295" ...
##  $ value          : num [1:210] 29760870 491759 465437 1848667 464548 ...
data1980a <- data1980 %>%
  mutate(age = ifelse(age == "85 +", "85", age),age = as.numeric(age))
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `age = as.numeric(age)`.
## Caused by warning:
## ! NAs introduced by coercion
names(data1980a)
## [1] "country or area" "year"            "area"            "sex"            
## [5] "age"             "value"
# Load the necessary packages
library(dplyr)
library(tidyr)

# Use drop_na to remove rows with NA values
data1980b <- data1980a %>% drop_na()
# Assuming you want to assign the filtered data to a new variable data4
data1980c <- data1980b %>%
  filter(age >= 1 & age <= 85)

# View the structure of the filtered data
str(data1980c)
## tibble [170 × 6] (S3: tbl_df/tbl/data.frame)
##  $ country or area: chr [1:170] "United States of America" "United States of America" "United States of America" "United States of America" ...
##  $ year           : num [1:170] 1980 1980 1980 1980 1980 1980 1980 1980 1980 1980 ...
##  $ area           : chr [1:170] "Rural" "Rural" "Rural" "Rural" ...
##  $ sex            : chr [1:170] "Male" "Male" "Male" "Male" ...
##  $ age            : num [1:170] 1 2 3 4 5 6 7 8 9 10 ...
##  $ value          : num [1:170] 465437 464548 460762 457920 467849 ...

Median age

combined_data1 <- data1980c %>%
  group_by(age) %>%
  summarise(total_population = sum(value))
mean_age <- weighted.mean(combined_data1$age, combined_data1$total_population)
cat("Mean Age:", mean_age, "\n")
## Mean Age: 33.52052
# Calculate weighted median age
# Function for weighted median
weighted_median <- function(x, w) {
  # Sort by x
  sorted_data <- data.frame(x = x, w = w)[order(x),]
  cumulative_weight <- cumsum(sorted_data$w)
  total_weight <- sum(sorted_data$w)
  median_index <- which(cumulative_weight >= total_weight / 2)[1]
  return(sorted_data$x[median_index])
}

median_age <- weighted_median(combined_data1$age, combined_data1$total_population)
cat("Median Age:", median_age, "\n")
## Median Age: 30

Population Pyramid 2

library(ggplot2)
library(scales)  # For number formatting

# Convert male population to negative values and keep female population positive
data1980d <- data1980c %>%
  mutate(value = ifelse(sex == "Male", -value, value))

# Create the population pyramid
ggplot(data1980d, aes(x = age, y = value, fill = sex)) +
  geom_bar(stat = "identity") +
  coord_flip() +  # Flip coordinates for pyramid style
  scale_y_continuous(labels = function(x) comma(abs(x)), name = "Population") +  # Display positive values on the y-axis
  labs(x = "Age", title = "Rural Population by Gender and Single-Year Age Group, United States:1980") +
  scale_fill_manual(values = c("Male" = "blue", "Female" = "red")) +
  theme_minimal()

INdicator for age reporting errors

# Step 1: Extract the last digit of each age
data4f$last_digit <- data4f$age %% 10

# Step 2: Calculate the total population
total_population_meyer <- sum(data4f$value)

# Step 3: Calculate observed proportions for each last digit (0-9)
observed_proportions <- data4f %>%
  group_by(last_digit) %>%
  summarise(observed_population = sum(value)) %>%
  mutate(observed_proportion = observed_population / total_population_meyer)

# Step 4: Calculate Meyer's Blended Index
expected_proportion <- 0.1  # Expected proportion for each last digit in a uniform distribution
meyers_blended_index <- sum(abs(observed_proportions$observed_proportion - expected_proportion)) * 50

# Display the Meyer's Blended Index
meyers_blended_index
## [1] 1.219763

Congo

library(dplyr)
library(readr)
data_congo<-read_csv("C:\\Users\\anami\\OneDrive\\Documents\\DEM\\Assignment3\\Congo data.csv")
## Rows: 330 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): Sex, Age
## dbl (2): Year, Value
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
data_congo<-rename_with(data_congo, tolower) 
str(data_congo)
## spc_tbl_ [330 × 4] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ year : num [1:330] 2007 2007 2007 2007 2007 ...
##  $ sex  : chr [1:330] "Both Sexes" "Both Sexes" "Both Sexes" "Both Sexes" ...
##  $ age  : chr [1:330] "0" "1" "2" "3" ...
##  $ value: num [1:330] 125753 106819 104648 117109 103171 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   Year = col_double(),
##   ..   Sex = col_character(),
##   ..   Age = col_character(),
##   ..   Value = col_double()
##   .. )
##  - attr(*, "problems")=<externalptr>
# Load the necessary packages
library(dplyr)
library(tidyr)

# Use drop_na to remove rows with NA values
data_congo <- data_congo %>% drop_na()
str(data_congo)
## tibble [330 × 4] (S3: tbl_df/tbl/data.frame)
##  $ year : num [1:330] 2007 2007 2007 2007 2007 ...
##  $ sex  : chr [1:330] "Both Sexes" "Both Sexes" "Both Sexes" "Both Sexes" ...
##  $ age  : chr [1:330] "0" "1" "2" "3" ...
##  $ value: num [1:330] 125753 106819 104648 117109 103171 ...
data_congo1 <- subset(data_congo,sex %in% c("Male", "Female"))%>%
 filter(age >= 0 & age <= 89)%>%
mutate(age = as.numeric(age))
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `age = as.numeric(age)`.
## Caused by warning:
## ! NAs introduced by coercion
str(data_congo1)
## tibble [214 × 4] (S3: tbl_df/tbl/data.frame)
##  $ year : num [1:214] 2007 2007 2007 2007 2007 ...
##  $ sex  : chr [1:214] "Female" "Female" "Female" "Female" ...
##  $ age  : num [1:214] 0 1 2 3 4 5 6 7 8 10 ...
##  $ value: num [1:214] 63134 53219 52187 58738 51880 ...
data_congo2<-data_congo1%>%
filter(age >= 0 & age <= 89)
data_congo1 <- data_congo1 %>% drop_na()
str(data_congo2)
## tibble [178 × 4] (S3: tbl_df/tbl/data.frame)
##  $ year : num [1:178] 2007 2007 2007 2007 2007 ...
##  $ sex  : chr [1:178] "Female" "Female" "Female" "Female" ...
##  $ age  : num [1:178] 0 1 2 3 4 5 6 7 8 10 ...
##  $ value: num [1:178] 63134 53219 52187 58738 51880 ...
library(ggplot2)
library(scales)  # For number formatting

# Convert male population to negative values and keep female population positive
data_congof <- data_congo2 %>%
  mutate(value = ifelse(sex == "Male", -value, value))

# Create the population pyramid
ggplot(data_congof, aes(x = age, y = value, fill = sex)) +
  geom_bar(stat = "identity") +
  coord_flip() +  # Flip coordinates for pyramid style
  scale_y_continuous(labels = function(x) comma(abs(x)), name = "Population", breaks = seq(-60000, 60000, by = 20000)) +  # Display positive values on y-axis
  scale_x_continuous(breaks = seq(0, 90, by = 5), name = "Age") +  # Age groups (0-85) on the x-axis
  labs(title ="Population Distribution by Gender and Single-Year Age Group, Congo, 2007") +
  scale_fill_manual(values = c("Male" = "blue", "Female" = "red")) +
  theme_minimal()

data_bothsexes <- subset(data_congo,sex=="Both Sexes")%>%
mutate(age = as.numeric(age))
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `age = as.numeric(age)`.
## Caused by warning:
## ! NAs introduced by coercion
str(data_bothsexes)
## tibble [110 × 4] (S3: tbl_df/tbl/data.frame)
##  $ year : num [1:110] 2007 2007 2007 2007 2007 ...
##  $ sex  : chr [1:110] "Both Sexes" "Both Sexes" "Both Sexes" "Both Sexes" ...
##  $ age  : num [1:110] 0 1 2 3 4 5 6 7 8 9 ...
##  $ value: num [1:110] 125753 106819 104648 117109 103171 ...
data_bothsexes1<-data_bothsexes%>%
filter(age >= 0 & age <= 89)
data_bothsexes1 <- data_bothsexes1 %>% drop_na()
str(data_bothsexes1)
## tibble [90 × 4] (S3: tbl_df/tbl/data.frame)
##  $ year : num [1:90] 2007 2007 2007 2007 2007 ...
##  $ sex  : chr [1:90] "Both Sexes" "Both Sexes" "Both Sexes" "Both Sexes" ...
##  $ age  : num [1:90] 0 1 2 3 4 5 6 7 8 9 ...
##  $ value: num [1:90] 125753 106819 104648 117109 103171 ...
mean_age <- weighted.mean(data_bothsexes1$age, data_bothsexes1$value)
cat("Mean Age:", mean_age, "\n")
## Mean Age: 23.29439
weighted_median <- function(x, w) {
  # Sort by x
  sorted_data <- data.frame(x = x, w = w)[order(x),]
  cumulative_weight <- cumsum(sorted_data$w)
  total_weight <- sum(sorted_data$w)
  median_index <- which(cumulative_weight >= total_weight / 2)[1]
  return(sorted_data$x[median_index])
}

median_age <- weighted_median(data_bothsexes1$age, data_bothsexes1$value)
cat("Median Age:", median_age, "\n")
## Median Age: 20
# Step 1: Extract the last digit of each age
data_bothsexes1$last_digit <- data_bothsexes1$age %% 10

# Step 2: Calculate the total population
total_population_meyer <- sum(data_bothsexes1$value)

# Step 3: Calculate observed proportions for each last digit (0-9)
observed_proportions <- data_bothsexes1 %>%
  group_by(last_digit) %>%
  summarise(observed_population = sum(value)) %>%
  mutate(observed_proportion = observed_population / total_population_meyer)

# Step 4: Calculate Meyer's Blended Index
expected_proportion <- 0.1  # Expected proportion for each last digit in a uniform distribution
meyers_blended_index <- sum(abs(observed_proportions$observed_proportion - expected_proportion)) * 50

# Display the Meyer's Blended Index
meyers_blended_index
## [1] 4.129311
library(ggplot2)
library(dplyr)

# Create the moving average function
moving_average <- function(x, n = 3) {
  stats::filter(x, rep(1/n, n), sides = 2)
}

# Apply moving average to the population data
data_bothsexes1 <- data_bothsexes1 %>%
  arrange(age)  # Ensure the data is ordered by age

# Apply the moving average with a window of 5 (or adjust as needed)
data_bothsexes1 <- data_bothsexes1 %>%
  mutate(smoothed_population = moving_average(value, n = 5))

# Remove NA values caused by edge effects from smoothing
data_bothsexes1 <- na.omit(data_bothsexes1)

# Plot the original and smoothed population distributions along with a linear trend
ggplot(data_bothsexes1, aes(x = age)) +
  geom_line(aes(y = value, color = "Total Population Estimate"), size = 1) +  # Original population
  geom_line(aes(y = smoothed_population, color = "'Smoothed' by Moving Average"), 
            linetype = "solid", size = 1) +  # Smoothed population, thicker line
  geom_smooth(aes(y = value, color = "Linear (Total Population Estimate)"), 
              method = "lm", se = FALSE, linetype = "longdash", size = 1) +  # Linear trend line
  scale_color_manual(values = c("Total Population Estimate" = "blue",
                                "'Smoothed' by Moving Average" = "red",
                                "Linear (Total Population Estimate)" = "purple")) +
  labs(title = "Population of Congo, by Single Years of Age: 2007",
       x = "Age",
       y = "Total Population Estimate",
       color = "Legend") +
  theme_minimal() +
  theme(legend.position = "bottom") +
  scale_y_continuous(labels = scales::comma)  # Use comma for formatting population numbers
## `geom_smooth()` using formula = 'y ~ x'