library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
# Load the country statistics CSV; every later section reads `country_stat`.
country_stat <- read.csv(file = "~/Desktop/BANA 4137/country_stat.csv")
QUESTION 1:
# Count the missing values in each column of the data set.
vapply(country_stat, function(column) sum(is.na(column)), numeric(1))
## country year infant_mortality life_expectancy
## 0 0 1453 0
## fertility population gdp continent
## 187 185 2972 0
## region
## 0
# Indices of every row that has at least one missing field.
missing_rows <- which(rowSums(is.na(country_stat)) > 0)
# Preview the first few incomplete rows.
head(country_stat[missing_rows, ])
## country year infant_mortality life_expectancy fertility
## 1 Albania 1960 115.4 62.87 6.19
## 3 Angola 1960 208.0 35.98 7.32
## 4 Antigua and Barbuda 1960 NA 62.97 4.43
## 6 Armenia 1960 NA 66.86 4.55
## 7 Aruba 1960 NA 65.66 4.82
## 10 Azerbaijan 1960 NA 61.33 5.57
## population gdp continent region
## 1 1636054 NA Europe Southern Europe
## 3 5270844 NA Africa Middle Africa
## 4 54681 NA Americas Caribbean
## 6 1867396 NA Asia Western Asia
## 7 54208 NA Americas Caribbean
## 10 3897889 NA Asia Western Asia
QUESTION 2:
# Count the distinct countries and the distinct observation years, then
# report both figures.
num_countries <- n_distinct(country_stat$country)
num_years <- n_distinct(country_stat$year)
cat("Number of unique countries:", num_countries, "\n")
## Number of unique countries: 185
cat("Number of years of observations:", num_years, "\n")
## Number of years of observations: 57
QUESTION 3:
# Derive GDP per capita (gdp divided by population) for every row. The result
# is stored in its own data frame, `GDP_per_capita`; `country_stat` itself is
# not modified by this step.
GDP_per_capita <- country_stat %>%
  mutate(GDP_per_capita = gdp / population)
# Preview the derived frame rather than `country_stat`, so that the newly
# added GDP_per_capita column actually appears in the output.
head(GDP_per_capita)
## country year infant_mortality life_expectancy fertility
## 1 Albania 1960 115.40 62.87 6.19
## 2 Algeria 1960 148.20 47.50 7.65
## 3 Angola 1960 208.00 35.98 7.32
## 4 Antigua and Barbuda 1960 NA 62.97 4.43
## 5 Argentina 1960 59.87 65.39 3.11
## 6 Armenia 1960 NA 66.86 4.55
## population gdp continent region
## 1 1636054 NA Europe Southern Europe
## 2 11124892 13828152297 Africa Northern Africa
## 3 5270844 NA Africa Middle Africa
## 4 54681 NA Americas Caribbean
## 5 20619075 108322326649 Americas South America
## 6 1867396 NA Asia Western Asia
QUESTION 4:
Question 1: Does a correlation exist between infant mortality rate and fertility rate? How does this correlation vary across different continents and regions?
# Infant mortality over time: one line per region, one panel per continent.
# The data holds one row per country and year, so drawing it directly with
# `group = region` joins every country of a region into a single zig-zag path.
# Averaging to one value per region and year first gives each region a proper
# time series. Rows without an infant mortality figure are dropped before
# averaging so that a missing country does not blank out a region-year.
country_stat %>%
  filter(!is.na(infant_mortality)) %>%
  group_by(continent, region, year) %>%
  summarise(infant_mortality = mean(infant_mortality), .groups = "drop") %>%
  ggplot(aes(x = year, y = infant_mortality, color = region, group = region)) +
  geom_line() +
  labs(x = "Year", y = "Mean Infant Mortality Rate", color = "Region") +
  facet_wrap(~continent) +
  ggtitle("Infant mortality rates categorized by region and continent") +
  theme_bw()
## Warning: Removed 249 rows containing missing values (`geom_line()`).
The generated plots illustrate the evolution of infant mortality rates and fertility rates across countries in the dataset over time, organized by continent. Asia experienced a surge in fertility rates during the 1980s. Additionally, the visual analysis suggests a positive correlation between infant mortality rate and fertility rate.
Question 2: Which geographic region exhibits the highest GDP per capita, and how has this metric evolved over time?
# Mean GDP per capita for each region and year. With its default settings the
# formula interface of aggregate() omits rows that have a missing value in any
# of the three columns used here.
data_gdp <- aggregate(
  GDP_per_capita ~ region + year,
  data = GDP_per_capita[, c("year", "region", "GDP_per_capita")],
  FUN = mean
)
# One line per region showing how the regional mean changes over time.
ggplot(data = data_gdp, mapping = aes(year, GDP_per_capita, colour = region)) +
  geom_line() +
  labs(
    title = "GDP per capita by region over time",
    x = "Year",
    y = "GDP per capita"
  )
Question 3: Has there been a shift in population distribution among continents over the years?
# Each country's share of the world population in its year.
# `na.rm = TRUE` keeps a single missing population from turning the whole
# year's total (and therefore every share in that year) into NA, and
# ungroup() stops the per-year grouping from leaking into later steps.
country_continents <- country_stat %>%
  group_by(year) %>%
  mutate(
    Pct_Global_Population = population / sum(population, na.rm = TRUE) * 100
  ) %>%
  ungroup()
# Stacked area chart of each continent's share over time.
# `country_continents` has one row per country and year, so the shares are
# summed per continent and year before plotting; handing the country rows to
# geom_area() with `fill = continent` would give it many values per continent
# at every year. Rows with no share (missing population) are dropped first.
country_continents %>%
  filter(!is.na(Pct_Global_Population)) %>%
  group_by(year, continent) %>%
  summarise(
    Pct_Global_Population = sum(Pct_Global_Population),
    .groups = "drop"
  ) %>%
  ggplot(aes(x = year, y = Pct_Global_Population, fill = continent)) +
  geom_area() +
  labs(title = "Population Distribution Among Continents Over Years",
       x = "Year",
       y = "Percentage of Global Population") +
  theme_minimal()
## Warning: Removed 185 rows containing non-finite values (`stat_align()`).
Question 4: How does the distribution of GDP per capita vary across different regions of the world over the years?
# Add GDP per capita (gdp divided by population) to `country_stat` itself.
country_stat$GDP_per_capita <- with(country_stat, gdp / population)
# Histogram of GDP per capita in bins of width 1000, with the bars of the
# different regions placed side by side within each bin.
ggplot(data = country_stat, mapping = aes(GDP_per_capita, fill = region)) +
  geom_histogram(position = "dodge", binwidth = 1000) +
  theme_minimal() +
  labs(
    x = "GDP per Capita",
    y = "Frequency",
    title = "Distribution of GDP per Capita Across Regions"
  )
## Warning: Removed 2972 rows containing non-finite values (`stat_bin()`).