remove() #Remove objects from the workspace
rm(list=ls()) #removes all objects from the current workspace (R memory)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(readr)
data_usa<-read_csv("C:\\Users\\anami\\OneDrive\\Documents\\DEM\\Assignment3\\USA.csv")
## Rows: 3859 Columns: 5
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): Area, Sex, Age
## dbl (2): Year, Population
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
data_usa<-rename_with(data_usa, tolower)
str(data_usa)
## spc_tbl_ [3,859 × 5] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ year : num [1:3859] 2010 2010 2010 2010 2010 2010 2010 2010 2010 2010 ...
## $ area : chr [1:3859] "Total" "Total" "Total" "Total" ...
## $ sex : chr [1:3859] "Both Sexes" "Both Sexes" "Both Sexes" "Both Sexes" ...
## $ age : chr [1:3859] "Total" "0" "0 - 4" "1" ...
## $ population: num [1:3859] 3.09e+08 3.94e+06 2.02e+07 3.98e+06 4.10e+06 ...
## - attr(*, "spec")=
## .. cols(
## .. Year = col_double(),
## .. Area = col_character(),
## .. Sex = col_character(),
## .. Age = col_character(),
## .. Population = col_double()
## .. )
## - attr(*, "problems")=<externalptr>
data_usa1 <- subset(data_usa, area == "Total" & sex=="Both Sexes")
str(data_usa1)
## tibble [506 × 5] (S3: tbl_df/tbl/data.frame)
## $ year : num [1:506] 2010 2010 2010 2010 2010 2010 2010 2010 2010 2010 ...
## $ area : chr [1:506] "Total" "Total" "Total" "Total" ...
## $ sex : chr [1:506] "Both Sexes" "Both Sexes" "Both Sexes" "Both Sexes" ...
## $ age : chr [1:506] "Total" "0" "0 - 4" "1" ...
## $ population: num [1:506] 3.09e+08 3.94e+06 2.02e+07 3.98e+06 4.10e+06 ...
data_usa2 <- data_usa1 %>%
mutate(
age = as.numeric(age))
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `age = as.numeric(age)`.
## Caused by warning:
## ! NAs introduced by coercion
names(data_usa2)
## [1] "year" "area" "sex" "age" "population"
str(data_usa2)
## tibble [506 × 5] (S3: tbl_df/tbl/data.frame)
## $ year : num [1:506] 2010 2010 2010 2010 2010 2010 2010 2010 2010 2010 ...
## $ area : chr [1:506] "Total" "Total" "Total" "Total" ...
## $ sex : chr [1:506] "Both Sexes" "Both Sexes" "Both Sexes" "Both Sexes" ...
## $ age : num [1:506] NA 0 NA 1 2 3 4 5 NA 6 ...
## $ population: num [1:506] 3.09e+08 3.94e+06 2.02e+07 3.98e+06 4.10e+06 ...
data_usa2 %>%
filter(age >= 0 & age <= 100)
## # A tibble: 375 × 5
## year area sex age population
## <dbl> <chr> <chr> <dbl> <dbl>
## 1 2010 Total Both Sexes 0 3944153
## 2 2010 Total Both Sexes 1 3978070
## 3 2010 Total Both Sexes 2 4096929
## 4 2010 Total Both Sexes 3 4119040
## 5 2010 Total Both Sexes 4 4063170
## 6 2010 Total Both Sexes 5 4056858
## 7 2010 Total Both Sexes 6 4066381
## 8 2010 Total Both Sexes 7 4030579
## 9 2010 Total Both Sexes 8 4046486
## 10 2010 Total Both Sexes 9 4148353
## # ℹ 365 more rows
str(data_usa2)
## tibble [506 × 5] (S3: tbl_df/tbl/data.frame)
## $ year : num [1:506] 2010 2010 2010 2010 2010 2010 2010 2010 2010 2010 ...
## $ area : chr [1:506] "Total" "Total" "Total" "Total" ...
## $ sex : chr [1:506] "Both Sexes" "Both Sexes" "Both Sexes" "Both Sexes" ...
## $ age : num [1:506] NA 0 NA 1 2 3 4 5 NA 6 ...
## $ population: num [1:506] 3.09e+08 3.94e+06 2.02e+07 3.98e+06 4.10e+06 ...
library(tidyr)
library(dplyr)
data_usa2 <- data_usa2 %>% drop_na()
# Load required libraries
library(ggplot2)
library(dplyr)
library(scales)
##
## Attaching package: 'scales'
## The following object is masked from 'package:readr':
##
## col_factor
# Step 1: Calculate birth cohort
data_usa2<- data_usa2 %>%
mutate(cohort = year - age)
# Step 2: Plot cohort analysis with smoother lines, inverse x-axis order, and formatted population
ggplot(data_usa2, aes(x = cohort, y = population, color = factor(year), group = factor(year))) +
geom_line(size = 1) +
geom_point(size = 1)+ # Smooth lines using loess
scale_x_reverse(breaks = seq(1900, 2010, by = 10), labels = paste0(seq(1900, 2010, by = 10), "-", seq(1895, 2005, by = 10))) + # Inverse x-axis order
scale_y_continuous(labels = comma) + # Format population with commas
scale_color_manual(values = c("blue", "cyan", "yellow", "purple")) + # Colors for different census years
labs(title = "Cohort Analysis, United States, 1980-2010",
x = "Birth Cohort",
y = "Population",
color = "Census Year") +
theme_minimal() +
theme(legend.position = "bottom",
axis.text.x = element_text(angle = 45, hjust = 1))
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
library(dplyr)
library(readr)
data1<-read_csv("C:\\Users\\anami\\OneDrive\\Documents\\DEM\\Assignment3\\Assignment 3.csv")
## Rows: 3858 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): Country or Area, Area, Sex, Age
## dbl (2): Year, Value
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
data1<-rename_with(data1, tolower)
str(data1)
## spc_tbl_ [3,858 × 6] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ country or area: chr [1:3858] "United States of America" "United States of America" "United States of America" "United States of America" ...
## $ year : num [1:3858] 2010 2010 2010 2010 2010 2010 2010 2010 2010 2010 ...
## $ area : chr [1:3858] "Total" "Total" "Total" "Total" ...
## $ sex : chr [1:3858] "Both Sexes" "Both Sexes" "Both Sexes" "Both Sexes" ...
## $ age : chr [1:3858] "Total" "0" "0 - 4" "1" ...
## $ value : num [1:3858] 3.09e+08 3.94e+06 2.02e+07 3.98e+06 4.10e+06 ...
## - attr(*, "spec")=
## .. cols(
## .. `Country or Area` = col_character(),
## .. Year = col_double(),
## .. Area = col_character(),
## .. Sex = col_character(),
## .. Age = col_character(),
## .. Value = col_double()
## .. )
## - attr(*, "problems")=<externalptr>
data2 <- subset(data1, year == 2010 & area == "Total" & sex %in% c("Male", "Female"))
str(data2)
## tibble [244 × 6] (S3: tbl_df/tbl/data.frame)
## $ country or area: chr [1:244] "United States of America" "United States of America" "United States of America" "United States of America" ...
## $ year : num [1:244] 2010 2010 2010 2010 2010 2010 2010 2010 2010 2010 ...
## $ area : chr [1:244] "Total" "Total" "Total" "Total" ...
## $ sex : chr [1:244] "Male" "Male" "Male" "Male" ...
## $ age : chr [1:244] "Total" "0" "0 - 4" "1" ...
## $ value : num [1:244] 1.52e+08 2.01e+06 1.03e+07 2.03e+06 2.09e+06 ...
data3 <- data2 %>%
mutate(
age = as.numeric(age))
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `age = as.numeric(age)`.
## Caused by warning:
## ! NAs introduced by coercion
names(data3)
## [1] "country or area" "year" "area" "sex"
## [5] "age" "value"
# Load the necessary packages
library(dplyr)
library(tidyr)
# Use drop_na to remove rows with NA values
data4 <- data3 %>% drop_na()
# Assuming you want to assign the filtered data to a new variable data4
data4f <- data4 %>%
filter(age >= 1 & age <= 89)
# View the structure of the filtered data
str(data4f)
## tibble [178 × 6] (S3: tbl_df/tbl/data.frame)
## $ country or area: chr [1:178] "United States of America" "United States of America" "United States of America" "United States of America" ...
## $ year : num [1:178] 2010 2010 2010 2010 2010 2010 2010 2010 2010 2010 ...
## $ area : chr [1:178] "Total" "Total" "Total" "Total" ...
## $ sex : chr [1:178] "Male" "Male" "Male" "Male" ...
## $ age : num [1:178] 1 2 3 4 5 6 7 8 9 10 ...
## $ value : num [1:178] 2030853 2092198 2104550 2077550 2072094 ...
combined_data <- data4f %>%
group_by(age) %>%
summarise(total_population = sum(value))
mean_age <- weighted.mean(combined_data$age, combined_data$total_population)
cat("Mean Age:", mean_age, "\n")
## Mean Age: 37.4545
# Calculate weighted median age
# Function for weighted median
weighted_median <- function(x, w) {
# Sort by x
sorted_data <- data.frame(x = x, w = w)[order(x),]
cumulative_weight <- cumsum(sorted_data$w)
total_weight <- sum(sorted_data$w)
median_index <- which(cumulative_weight >= total_weight / 2)[1]
return(sorted_data$x[median_index])
}
median_age <- weighted_median(combined_data$age, combined_data$total_population)
cat("Median Age:", median_age, "\n")
## Median Age: 37
library(ggplot2)
library(scales) # For number formatting
# Convert male population to negative values and keep female population positive
data4g <- data4f %>%
mutate(value = ifelse(sex == "Male", -value, value))
# Create the population pyramid
ggplot(data4g, aes(x = age, y = value, fill = sex)) +
geom_bar(stat = "identity") +
coord_flip() + # Flip coordinates for pyramid style
scale_y_continuous(labels = function(x) comma(abs(x)), name = "Population") + # Display positive values on the y-axis
labs(x = "Age", title ="Population distribution by Gender and Single-Year Age Group,United States,2010") +
scale_fill_manual(values = c("Male" = "blue", "Female" = "red")) +
theme_minimal()
### 1980 census
data1980 <- subset(data1, year == 1980 & area == "Rural" & sex %in% c("Male", "Female"))
str(data1980)
## tibble [210 × 6] (S3: tbl_df/tbl/data.frame)
## $ country or area: chr [1:210] "United States of America" "United States of America" "United States of America" "United States of America" ...
## $ year : num [1:210] 1980 1980 1980 1980 1980 1980 1980 1980 1980 1980 ...
## $ area : chr [1:210] "Rural" "Rural" "Rural" "Rural" ...
## $ sex : chr [1:210] "Male" "Male" "Male" "Male" ...
## $ age : chr [1:210] "Total" "0" "1" "45295" ...
## $ value : num [1:210] 29760870 491759 465437 1848667 464548 ...
data1980a <- data1980 %>%
mutate(age = ifelse(age == "85 +", "85", age),age = as.numeric(age))
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `age = as.numeric(age)`.
## Caused by warning:
## ! NAs introduced by coercion
names(data1980a)
## [1] "country or area" "year" "area" "sex"
## [5] "age" "value"
# Load the necessary packages
library(dplyr)
library(tidyr)
# Use drop_na to remove rows with NA values
data1980b <- data1980a %>% drop_na()
# Assuming you want to assign the filtered data to a new variable data4
data1980c <- data1980b %>%
filter(age >= 1 & age <= 85)
# View the structure of the filtered data
str(data1980c)
## tibble [170 × 6] (S3: tbl_df/tbl/data.frame)
## $ country or area: chr [1:170] "United States of America" "United States of America" "United States of America" "United States of America" ...
## $ year : num [1:170] 1980 1980 1980 1980 1980 1980 1980 1980 1980 1980 ...
## $ area : chr [1:170] "Rural" "Rural" "Rural" "Rural" ...
## $ sex : chr [1:170] "Male" "Male" "Male" "Male" ...
## $ age : num [1:170] 1 2 3 4 5 6 7 8 9 10 ...
## $ value : num [1:170] 465437 464548 460762 457920 467849 ...
combined_data1 <- data1980c %>%
group_by(age) %>%
summarise(total_population = sum(value))
mean_age <- weighted.mean(combined_data1$age, combined_data1$total_population)
cat("Mean Age:", mean_age, "\n")
## Mean Age: 33.52052
# Calculate weighted median age
# Function for weighted median
weighted_median <- function(x, w) {
# Sort by x
sorted_data <- data.frame(x = x, w = w)[order(x),]
cumulative_weight <- cumsum(sorted_data$w)
total_weight <- sum(sorted_data$w)
median_index <- which(cumulative_weight >= total_weight / 2)[1]
return(sorted_data$x[median_index])
}
median_age <- weighted_median(combined_data1$age, combined_data1$total_population)
cat("Median Age:", median_age, "\n")
## Median Age: 30
library(ggplot2)
library(scales) # For number formatting
# Convert male population to negative values and keep female population positive
data1980d <- data1980c %>%
mutate(value = ifelse(sex == "Male", -value, value))
# Create the population pyramid
ggplot(data1980d, aes(x = age, y = value, fill = sex)) +
geom_bar(stat = "identity") +
coord_flip() + # Flip coordinates for pyramid style
scale_y_continuous(labels = function(x) comma(abs(x)), name = "Population") + # Display positive values on the y-axis
labs(x = "Age", title = "Rural Population by Gender and Single-Year Age Group, United States:1980") +
scale_fill_manual(values = c("Male" = "blue", "Female" = "red")) +
theme_minimal()
# Step 1: Extract the last digit of each age
data4f$last_digit <- data4f$age %% 10
# Step 2: Calculate the total population
total_population_meyer <- sum(data4f$value)
# Step 3: Calculate observed proportions for each last digit (0-9)
observed_proportions <- data4f %>%
group_by(last_digit) %>%
summarise(observed_population = sum(value)) %>%
mutate(observed_proportion = observed_population / total_population_meyer)
# Step 4: Calculate Meyer's Blended Index
expected_proportion <- 0.1 # Expected proportion for each last digit in a uniform distribution
meyers_blended_index <- sum(abs(observed_proportions$observed_proportion - expected_proportion)) * 50
# Display the Meyer's Blended Index
meyers_blended_index
## [1] 1.219763
library(dplyr)
library(readr)
data_congo<-read_csv("C:\\Users\\anami\\OneDrive\\Documents\\DEM\\Assignment3\\Congo data.csv")
## Rows: 330 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): Sex, Age
## dbl (2): Year, Value
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
data_congo<-rename_with(data_congo, tolower)
str(data_congo)
## spc_tbl_ [330 × 4] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ year : num [1:330] 2007 2007 2007 2007 2007 ...
## $ sex : chr [1:330] "Both Sexes" "Both Sexes" "Both Sexes" "Both Sexes" ...
## $ age : chr [1:330] "0" "1" "2" "3" ...
## $ value: num [1:330] 125753 106819 104648 117109 103171 ...
## - attr(*, "spec")=
## .. cols(
## .. Year = col_double(),
## .. Sex = col_character(),
## .. Age = col_character(),
## .. Value = col_double()
## .. )
## - attr(*, "problems")=<externalptr>
# Load the necessary packages
library(dplyr)
library(tidyr)
# Use drop_na to remove rows with NA values
data_congo <- data_congo %>% drop_na()
str(data_congo)
## tibble [330 × 4] (S3: tbl_df/tbl/data.frame)
## $ year : num [1:330] 2007 2007 2007 2007 2007 ...
## $ sex : chr [1:330] "Both Sexes" "Both Sexes" "Both Sexes" "Both Sexes" ...
## $ age : chr [1:330] "0" "1" "2" "3" ...
## $ value: num [1:330] 125753 106819 104648 117109 103171 ...
data_congo1 <- subset(data_congo,sex %in% c("Male", "Female"))%>%
filter(age >= 0 & age <= 89)%>%
mutate(age = as.numeric(age))
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `age = as.numeric(age)`.
## Caused by warning:
## ! NAs introduced by coercion
str(data_congo1)
## tibble [214 × 4] (S3: tbl_df/tbl/data.frame)
## $ year : num [1:214] 2007 2007 2007 2007 2007 ...
## $ sex : chr [1:214] "Female" "Female" "Female" "Female" ...
## $ age : num [1:214] 0 1 2 3 4 5 6 7 8 10 ...
## $ value: num [1:214] 63134 53219 52187 58738 51880 ...
data_congo2<-data_congo1%>%
filter(age >= 0 & age <= 89)
data_congo1 <- data_congo1 %>% drop_na()
str(data_congo2)
## tibble [178 × 4] (S3: tbl_df/tbl/data.frame)
## $ year : num [1:178] 2007 2007 2007 2007 2007 ...
## $ sex : chr [1:178] "Female" "Female" "Female" "Female" ...
## $ age : num [1:178] 0 1 2 3 4 5 6 7 8 10 ...
## $ value: num [1:178] 63134 53219 52187 58738 51880 ...
library(ggplot2)
library(scales) # For number formatting
# Convert male population to negative values and keep female population positive
data_congof <- data_congo2 %>%
mutate(value = ifelse(sex == "Male", -value, value))
# Create the population pyramid
ggplot(data_congof, aes(x = age, y = value, fill = sex)) +
geom_bar(stat = "identity") +
coord_flip() + # Flip coordinates for pyramid style
scale_y_continuous(labels = function(x) comma(abs(x)), name = "Population", breaks = seq(-60000, 60000, by = 20000)) + # Display positive values on y-axis
scale_x_continuous(breaks = seq(0, 90, by = 5), name = "Age") + # Age groups (0-85) on the x-axis
labs(title ="Population Distribution by Gender and Single-Year Age Group, Congo, 2007") +
scale_fill_manual(values = c("Male" = "blue", "Female" = "red")) +
theme_minimal()
data_bothsexes <- subset(data_congo,sex=="Both Sexes")%>%
mutate(age = as.numeric(age))
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `age = as.numeric(age)`.
## Caused by warning:
## ! NAs introduced by coercion
str(data_bothsexes)
## tibble [110 × 4] (S3: tbl_df/tbl/data.frame)
## $ year : num [1:110] 2007 2007 2007 2007 2007 ...
## $ sex : chr [1:110] "Both Sexes" "Both Sexes" "Both Sexes" "Both Sexes" ...
## $ age : num [1:110] 0 1 2 3 4 5 6 7 8 9 ...
## $ value: num [1:110] 125753 106819 104648 117109 103171 ...
data_bothsexes1<-data_bothsexes%>%
filter(age >= 0 & age <= 89)
data_bothsexes1 <- data_bothsexes1 %>% drop_na()
str(data_bothsexes1)
## tibble [90 × 4] (S3: tbl_df/tbl/data.frame)
## $ year : num [1:90] 2007 2007 2007 2007 2007 ...
## $ sex : chr [1:90] "Both Sexes" "Both Sexes" "Both Sexes" "Both Sexes" ...
## $ age : num [1:90] 0 1 2 3 4 5 6 7 8 9 ...
## $ value: num [1:90] 125753 106819 104648 117109 103171 ...
mean_age <- weighted.mean(data_bothsexes1$age, data_bothsexes1$value)
cat("Mean Age:", mean_age, "\n")
## Mean Age: 23.29439
weighted_median <- function(x, w) {
# Sort by x
sorted_data <- data.frame(x = x, w = w)[order(x),]
cumulative_weight <- cumsum(sorted_data$w)
total_weight <- sum(sorted_data$w)
median_index <- which(cumulative_weight >= total_weight / 2)[1]
return(sorted_data$x[median_index])
}
median_age <- weighted_median(data_bothsexes1$age, data_bothsexes1$value)
cat("Median Age:", median_age, "\n")
## Median Age: 20
# Step 1: Extract the last digit of each age
data_bothsexes1$last_digit <- data_bothsexes1$age %% 10
# Step 2: Calculate the total population
total_population_meyer <- sum(data_bothsexes1$value)
# Step 3: Calculate observed proportions for each last digit (0-9)
observed_proportions <- data_bothsexes1 %>%
group_by(last_digit) %>%
summarise(observed_population = sum(value)) %>%
mutate(observed_proportion = observed_population / total_population_meyer)
# Step 4: Calculate Meyer's Blended Index
expected_proportion <- 0.1 # Expected proportion for each last digit in a uniform distribution
meyers_blended_index <- sum(abs(observed_proportions$observed_proportion - expected_proportion)) * 50
# Display the Meyer's Blended Index
meyers_blended_index
## [1] 4.129311
library(ggplot2)
library(dplyr)
# Create the moving average function
moving_average <- function(x, n = 3) {
stats::filter(x, rep(1/n, n), sides = 2)
}
# Apply moving average to the population data
data_bothsexes1 <- data_bothsexes1 %>%
arrange(age) # Ensure the data is ordered by age
# Apply the moving average with a window of 5 (or adjust as needed)
data_bothsexes1 <- data_bothsexes1 %>%
mutate(smoothed_population = moving_average(value, n = 5))
# Remove NA values caused by edge effects from smoothing
data_bothsexes1 <- na.omit(data_bothsexes1)
# Plot the original and smoothed population distributions along with a linear trend
ggplot(data_bothsexes1, aes(x = age)) +
geom_line(aes(y = value, color = "Total Population Estimate"), size = 1) + # Original population
geom_line(aes(y = smoothed_population, color = "'Smoothed' by Moving Average"),
linetype = "solid", size = 1) + # Smoothed population, thicker line
geom_smooth(aes(y = value, color = "Linear (Total Population Estimate)"),
method = "lm", se = FALSE, linetype = "longdash", size = 1) + # Linear trend line
scale_color_manual(values = c("Total Population Estimate" = "blue",
"'Smoothed' by Moving Average" = "red",
"Linear (Total Population Estimate)" = "purple")) +
labs(title = "Population of Congo, by Single Years of Age: 2007",
x = "Age",
y = "Total Population Estimate",
color = "Legend") +
theme_minimal() +
theme(legend.position = "bottom") +
scale_y_continuous(labels = scales::comma) # Use comma for formatting population numbers
## `geom_smooth()` using formula = 'y ~ x'