library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ readr 2.1.5
## ✔ ggplot2 4.0.0 ✔ stringr 1.5.1
## ✔ lubridate 1.9.3 ✔ tibble 3.2.1
## ✔ purrr 1.0.2 ✔ tidyr 1.3.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(corrplot)
## Warning: package 'corrplot' was built under R version 4.3.3
## corrplot 0.95 loaded
# Read the raw population data and report its dimensions (rows, columns).
population <- read.csv(file = "population.csv")
dim(population)
## [1] 18944 3
The research question for this project is **“How does population growth over the years differ between China, India, the United States, Indonesia, and Pakistan?”** The analysis uses the data set population.csv from https://ourworldindata.org/population-growth#explore-data-on-population-growth, which consists of three columns titled “Entity”, “Year”, and “all years”. “Entity” holds the country names, “Year” holds the years, ranging from 1950 to 2023, and “all years” holds the population of a country in a specific year. The dimensions of the data set are 18944 x 3.
The data analysis being performed is descriptive analysis. The plot I created is a line graph with all 5 countries present, to compare the growth of each country from 1950 to 2023. There are 2 functions: the first is search_population(country, year), which returns the population size of a specified country in a specified year. The second is summary_of_country_population(country), which gives a summary (mean, median, sd, min, and max) of that country's population. I used the second function on all 5 countries. Both the summaries and the line graph help answer the question: the graph compares the countries with one another, and the summaries describe each country individually.
# Function 1: look up the population of one country in one year.
#
# Arguments:
#   country - a single character string that appears in population$CountryName.
#   year    - a single numeric value that appears in population$Year.
#
# Returns the matching row(s) of the global `population` data frame, or NULL
# (after printing a message) when either argument is invalid.
search_population <- function(country, year) {
  # Check type and length *before* the membership test so that vectors,
  # zero-length values, and wrong types take the message-and-NULL path instead
  # of raising an error inside a scalar `||` condition.
  country_ok <- is.character(country) && length(country) == 1L &&
    country %in% population$CountryName
  if (!country_ok) {
    cat("Error: Variable", country, "is not character or not found.\n")
    return(NULL)
  }

  year_ok <- is.numeric(year) && length(year) == 1L &&
    year %in% population$Year
  if (!year_ok) {
    cat("Error: Variable", year, "is not numeric or not found.\n")
    return(NULL)
  }

  population |>
    filter(CountryName == country, Year == year)
}
# Function 2: summarise one country's population across all of its rows.
#
# Arguments:
#   country - a single character string that appears in population$CountryName.
#
# Returns a one-row data frame holding the mean, median, standard deviation,
# minimum, and maximum of PopulationSize for that country, or NULL (after
# printing a message) when `country` is invalid. The summary columns are left
# unnamed in summarise(), so they carry the expression text as their names
# (e.g. `mean(PopulationSize)`).
summary_of_country_population <- function(country) {
  # Check type and length *before* the membership test so that vectors,
  # zero-length values, and wrong types take the message-and-NULL path instead
  # of raising an error inside a scalar `||` condition.
  country_ok <- is.character(country) && length(country) == 1L &&
    country %in% population$CountryName
  if (!country_ok) {
    cat("Error: Variable", country, "is not character or not found.\n")
    return(NULL)
  }

  population |>
    filter(CountryName == country) |>
    summarise(
      mean(PopulationSize),
      median(PopulationSize),
      sd(PopulationSize),
      min(PopulationSize),
      max(PopulationSize)
    )
}
# EDA (Exploratory Data Analysis)
# Give the raw columns descriptive names before any further processing.
population <- rename(
  population,
  CountryName = Entity,
  PopulationSize = all.years
)
# Inspect the column types.
str(population)
## 'data.frame': 18944 obs. of 3 variables:
## $ CountryName : chr "Afghanistan" "Afghanistan" "Afghanistan" "Afghanistan" ...
## $ Year : int 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 ...
## $ PopulationSize: num 7776180 7879343 7987784 8096703 8207954 ...
# Preview the first rows.
head(population)
## CountryName Year PopulationSize
## 1 Afghanistan 1950 7776180
## 2 Afghanistan 1951 7879343
## 3 Afghanistan 1952 7987784
## 4 Afghanistan 1953 8096703
## 5 Afghanistan 1954 8207954
## 6 Afghanistan 1955 8326981
# Count the missing values in each column.
population |>
  is.na() |>
  colSums()
## CountryName Year PopulationSize
## 0 0 0
# Testing functions: spot-check both helpers on countries that are not part of
# the research question.
search_population("Angola", 1975)
## CountryName Year PopulationSize
## 1 Angola 1975 6842952
summary_of_country_population("Bahamas")
## mean(PopulationSize) median(PopulationSize) sd(PopulationSize)
## 1 254237.9 259747 101518.3
## min(PopulationSize) max(PopulationSize)
## 1 83505 399452
# Using summary_of_country_population function on the countries in the research question
# (one call per country so each summary is printed as its own table).
summary_of_country_population("China")
## mean(PopulationSize) median(PopulationSize) sd(PopulationSize)
## 1 1047520241 1086725526 289330306
## min(PopulationSize) max(PopulationSize)
## 1 544044355 1426437269
summary_of_country_population("India")
## mean(PopulationSize) median(PopulationSize) sd(PopulationSize)
## 1 840741103 799810623 346536579
## min(PopulationSize) max(PopulationSize)
## 1 346278818 1438069597
summary_of_country_population("United States")
## mean(PopulationSize) median(PopulationSize) sd(PopulationSize)
## 1 248541443 245057550 56156125
## min(PopulationSize) max(PopulationSize)
## 1 154202683 343477330
summary_of_country_population("Indonesia")
## mean(PopulationSize) median(PopulationSize) sd(PopulationSize)
## 1 171739843 171843111 67000150
## min(PopulationSize) max(PopulationSize)
## 1 68799031 281190068
summary_of_country_population("Pakistan")
## mean(PopulationSize) median(PopulationSize) sd(PopulationSize)
## 1 118443599 103368740 67049426
## min(PopulationSize) max(PopulationSize)
## 1 35849256 247504505
# Visualization comparing the growth of China, India, United States, Indonesia, and Pakistan from 1950 to 2023
countries_in_question <- c("China", "India", "United States", "Indonesia", "Pakistan")
# Use %in% for set membership. `==` would recycle the 5-element vector down the
# CountryName column, keeping only the rows that happen to line up with it
# (roughly one in five of the intended rows) and emitting a "longer object
# length is not a multiple of shorter object length" warning.
population_for_5_countries <- population |>
  filter(CountryName %in% countries_in_question)
# NOTE(review): every result derived from population_for_5_countries (this
# plot and any model fitted on it) must be regenerated after this fix, because
# the data frame now contains all rows for the five countries.
ggplot(
  population_for_5_countries,
  aes(x = Year, y = PopulationSize, color = CountryName)
) +
  geom_line() +
  labs(
    title = "Population Size based on Country Over 1950 to 2023",
    x = "Year",
    y = "Population Size",
    color = "Country"
  ) +
  theme_minimal()
Test: ANOVA Test
My hypotheses are: \(H_0\): \(\mu_1 = \mu_2 = \dots = \mu_k\), where \(\mu_i\) is the mean population size of country \(i\) and \(k = 5\)
\(H_a\): not all \(\mu_i\) are equal
# One-way ANOVA of population size across the selected countries.
anova_test <- aov(
  formula = PopulationSize ~ CountryName,
  data = population_for_5_countries
)
summary(anova_test)
## Df Sum Sq Mean Sq F value Pr(>F)
## CountryName 4 1.081e+19 2.702e+18 57.42 <2e-16 ***
## Residuals 70 3.294e+18 4.706e+16
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Pairwise comparisons of the country means at a 95% family-wise level.
TukeyHSD(anova_test)
## Tukey multiple comparisons of means
## 95% family-wise confidence level
##
## Fit: aov(formula = PopulationSize ~ CountryName, data = population_for_5_countries)
##
## $CountryName
## diff lwr upr p adj
## India-China -180244294 -402043892 41555303 0.1651721
## Indonesia-China -861540880 -1083340477 -639741282 0.0000000
## Pakistan-China -914581155 -1136380752 -692781557 0.0000000
## United States-China -776603691 -998403289 -554804094 0.0000000
## Indonesia-India -681296585 -903096183 -459496988 0.0000000
## Pakistan-India -734336860 -956136458 -512537263 0.0000000
## United States-India -596359397 -818158994 -374559800 0.0000000
## Pakistan-Indonesia -53040275 -274839873 168759322 0.9622741
## United States-Indonesia 84937188 -136862409 306736786 0.8200483
## United States-Pakistan 137977463 -83822134 359777061 0.4155534
Result: p-value < 2e-16, which is less than α = 0.05. This indicates strong evidence against the null hypothesis. Overall, this test suggests that there are significant differences in mean population size among the 5 countries: China, India, United States, Indonesia, and Pakistan. Looking at the TukeyHSD results, the pairs Indonesia-China, Pakistan-China, United States-China, Indonesia-India, Pakistan-India, and United States-India all show statistically significant differences, since their adjusted p-values are less than α = 0.05. The remaining pairs (India-China, Pakistan-Indonesia, United States-Indonesia, and United States-Pakistan) are not significantly different.
The summaries (mean, median, etc.) agree with the line graph comparing the countries: both show that China has the highest population, followed by India, the US, Indonesia, and finally Pakistan. China has the highest mean, and the graph shows that the population has been increasing for all countries, with growth slowing down for all of them near 2023. An interesting feature of the graph is that China and India are very close to each other, and India could possibly overtake China as the country with the highest population. From the ANOVA test, we reject the null hypothesis since the p-value is less than 0.05. Therefore, there is strong evidence that not all countries have the same mean population.
This answers the question of how population growth differs between China, India, the United States, Indonesia, and Pakistan: China and India have had similar population growth over the years, while the United States, Indonesia, and Pakistan have had growth much more similar to each other. As of 2023, China and India are neck and neck, while the other countries remain at their respective ranks. As potential avenues for future research or analysis, the possible reasons for higher population sizes could be investigated and tested. Another option would be to compare all countries instead of just five, to see how the ANOVA test and the graphs behave.
Ritchie, Hannah, et al. “Population Growth.” Our World in Data, 17 June 2024, ourworldindata.org/population-growth#explore-data-on-population-growth.