library(ggplot2)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyr)
# Read the country statistics CSV into a data frame.
country_stat <- read.csv(file = "~/Desktop/BANA 4137/country_stat.csv")

QUESTION 1:

# Count the missing values in each column.
country_stat %>%
  is.na() %>%
  colSums()
##          country             year infant_mortality  life_expectancy 
##                0                0             1453                0 
##        fertility       population              gdp        continent 
##              187              185             2972                0 
##           region 
##                0
# Row indices of every observation that has at least one missing field.
missing_rows <- which(rowSums(is.na(country_stat)) > 0)
# Preview the first few incomplete rows.
head(country_stat[missing_rows, ])
##                country year infant_mortality life_expectancy fertility
## 1              Albania 1960            115.4           62.87      6.19
## 3               Angola 1960            208.0           35.98      7.32
## 4  Antigua and Barbuda 1960               NA           62.97      4.43
## 6              Armenia 1960               NA           66.86      4.55
## 7                Aruba 1960               NA           65.66      4.82
## 10          Azerbaijan 1960               NA           61.33      5.57
##    population gdp continent          region
## 1     1636054  NA    Europe Southern Europe
## 3     5270844  NA    Africa   Middle Africa
## 4       54681  NA  Americas       Caribbean
## 6     1867396  NA      Asia    Western Asia
## 7       54208  NA  Americas       Caribbean
## 10    3897889  NA      Asia    Western Asia

QUESTION 2:

# Number of distinct countries and distinct years present in the data.
num_countries <- n_distinct(country_stat$country)
num_years <- n_distinct(country_stat$year)
cat("Number of unique countries:", num_countries, "\n")
## Number of unique countries: 185
cat("Number of years of observations:", num_years, "\n")
## Number of years of observations: 57

QUESTION 3:

# Add GDP per capita (gdp divided by population) as a new column. The result
# is stored in a separate data frame, GDP_per_capita; country_stat itself is
# left unchanged by this step.
GDP_per_capita <- country_stat %>%
  mutate(GDP_per_capita = gdp / population)
# Preview the augmented data frame so the new column is actually visible
# (previewing country_stat would not show it). Re-run the document to refresh
# the printed output that follows.
head(GDP_per_capita)
##               country year infant_mortality life_expectancy fertility
## 1             Albania 1960           115.40           62.87      6.19
## 2             Algeria 1960           148.20           47.50      7.65
## 3              Angola 1960           208.00           35.98      7.32
## 4 Antigua and Barbuda 1960               NA           62.97      4.43
## 5           Argentina 1960            59.87           65.39      3.11
## 6             Armenia 1960               NA           66.86      4.55
##   population          gdp continent          region
## 1    1636054           NA    Europe Southern Europe
## 2   11124892  13828152297    Africa Northern Africa
## 3    5270844           NA    Africa   Middle Africa
## 4      54681           NA  Americas       Caribbean
## 5   20619075 108322326649  Americas   South America
## 6    1867396           NA      Asia    Western Asia

QUESTION 4:

Question 1: Does a correlation exist between infant mortality rate and fertility rate? How does this correlation vary across different continents and regions?

# Each region contains several countries, so plotting the raw rows with
# group = region joins every country's value within a year into one jagged
# line. Average to a single value per region and year before drawing.
infant_mortality_by_region <- country_stat %>%
  filter(!is.na(infant_mortality)) %>%
  group_by(continent, region, year) %>%
  summarise(infant_mortality = mean(infant_mortality), .groups = "drop")

# One line per region, one panel per continent.
ggplot(
  infant_mortality_by_region,
  aes(x = year, y = infant_mortality, color = region, group = region)
) +
  geom_line() +
  labs(x = "Year", y = "Mean Infant Mortality Rate", color = "Region") +
  facet_wrap(~continent) +
  ggtitle("Infant mortality rates categorized by region and continent") +
  theme_bw()
## Warning: Removed 249 rows containing missing values (`geom_line()`).

The plot above shows how infant mortality rates have evolved over time for each region, organized by continent. It does not display fertility rates, so the following observations still need a supporting figure (for example, the same plot with fertility on the y-axis, or a scatter plot of fertility against infant mortality): Asia appears to have experienced a surge in fertility rates during the 1980s, and infant mortality rate and fertility rate appear to be positively correlated.

Question 2: Which geographic region exhibits the highest GDP per capita, and how has this metric evolved over time?

# Mean GDP per capita for every region/year pair, computed from the three
# columns that matter; the formula interface of aggregate() drops rows whose
# GDP per capita is missing.
data_gdp <- aggregate(
  GDP_per_capita ~ region + year,
  data = GDP_per_capita[c("year", "region", "GDP_per_capita")],
  FUN = mean
)

# One line per region showing how the regional mean changes over time.
ggplot(data_gdp, aes(x = year, y = GDP_per_capita, color = region)) +
  geom_line() +
  xlab("Year") +
  ylab("GDP per capita") +
  ggtitle("GDP per capita by region over time")

Question 3: Has there been a shift in population distribution among continents over the years?

# Each country's share of the world population within a year. na.rm = TRUE
# keeps a single missing population from turning the whole year's total (and
# therefore every share in that year) into NA. The result is ungrouped so
# later steps do not silently inherit the per-year grouping.
country_continents <- country_stat %>%
  group_by(year) %>%
  mutate(
    Pct_Global_Population = population / sum(population, na.rm = TRUE) * 100
  ) %>%
  ungroup()

# geom_area() stacks one value per fill group at each x, so collapse the
# country-level shares to a single value per continent and year. Rows with no
# usable share (missing population) are dropped before summing.
continent_share <- country_continents %>%
  filter(!is.na(Pct_Global_Population)) %>%
  group_by(year, continent) %>%
  summarise(
    Pct_Global_Population = sum(Pct_Global_Population),
    .groups = "drop"
  )

ggplot(
  continent_share,
  aes(x = year, y = Pct_Global_Population, fill = continent)
) +
  geom_area() +
  labs(title = "Population Distribution Among Continents Over Years",
       x = "Year",
       y = "Percentage of Global Population") +
  theme_minimal()
## Warning: Removed 185 rows containing non-finite values (`stat_align()`).

Question 4: How does the distribution of GDP per capita vary across different regions of the world over the years?

# Store GDP per capita (gdp divided by population) directly on country_stat.
country_stat[["GDP_per_capita"]] <- with(country_stat, gdp / population)

# Histogram of GDP per capita in bins of width 1000, with the bars for each
# region placed side by side.
ggplot(country_stat, aes(x = GDP_per_capita, fill = region)) +
  geom_histogram(position = "dodge", binwidth = 1000) +
  ggtitle("Distribution of GDP per Capita Across Regions") +
  xlab("GDP per Capita") +
  ylab("Frequency") +
  theme_minimal()
## Warning: Removed 2972 rows containing non-finite values (`stat_bin()`).