Introduction

This dataset contains information about the population of countries around the world. Our aim for the project is to dive into this dataset and look at Earth's population from different perspectives.

#Loading libraries

#install.packages("tidyverse")   # for data cleaning
#install.packages("dplyr")  
#install.packages("reshape2")
#install.packages("maptools")    # for data visualization
#install.packages("ggthemes")  
#install.packages("gganimate")
#install.packages("cluster")     #for cluster analysis 
#install.packages("Rtsne")  
#install.packages("knitr")
#install.packages('data.table')

library(tidyverse)# for data cleaning
## Warning: package 'tidyverse' was built under R version 4.2.2
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.4.0      ✔ purrr   0.3.5 
## ✔ tibble  3.1.8      ✔ dplyr   1.0.10
## ✔ tidyr   1.2.1      ✔ stringr 1.4.1 
## ✔ readr   2.1.3      ✔ forcats 0.5.2
## Warning: package 'ggplot2' was built under R version 4.2.2
## Warning: package 'purrr' was built under R version 4.2.2
## Warning: package 'dplyr' was built under R version 4.2.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(dplyr)  
library(reshape2)
## Warning: package 'reshape2' was built under R version 4.2.2
## 
## Attaching package: 'reshape2'
## 
## The following object is masked from 'package:tidyr':
## 
##     smiths
library(maptools) # for data visualization
## Warning: package 'maptools' was built under R version 4.2.2
## Loading required package: sp
## Warning: package 'sp' was built under R version 4.2.2
## Checking rgeos availability: FALSE
## Please note that 'maptools' will be retired during 2023,
## plan transition at your earliest convenience;
## some functionality will be moved to 'sp'.
##      Note: when rgeos is not available, polygon geometry     computations in maptools depend on gpclib,
##      which has a restricted licence. It is disabled by default;
##      to enable gpclib, type gpclibPermit()
library(ggthemes)  
## Warning: package 'ggthemes' was built under R version 4.2.2
library(gganimate)
## Warning: package 'gganimate' was built under R version 4.2.2
library(cluster)#for cluster analysis 
## Warning: package 'cluster' was built under R version 4.2.2
library(Rtsne)  
## Warning: package 'Rtsne' was built under R version 4.2.2
library(knitr)
## Warning: package 'knitr' was built under R version 4.2.2
library(data.table)
## Warning: package 'data.table' was built under R version 4.2.2
## 
## Attaching package: 'data.table'
## 
## The following objects are masked from 'package:reshape2':
## 
##     dcast, melt
## 
## The following objects are masked from 'package:dplyr':
## 
##     between, first, last
## 
## The following object is masked from 'package:purrr':
## 
##     transpose
library(dplyr)

First, let's load the data and have a look at it:

# Load the world population table from the CSV in the working directory.
popdata <- read_csv("worldpop.csv")
## Rows: 234 Columns: 17
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (4): CCA3, Country, Capital, Continent
## dbl (13): Rank, 2022 Population, 2020 Population, 2015 Population, 2010 Popu...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Inspect the freshly loaded table.
# NOTE(review): presumably tibble's view() attached via tidyverse, which is
# only useful in an interactive session - confirm it is wanted when knitting.
view(popdata)

We can already see that some column names are not user-friendly and should be renamed.

# Give every column a syntactic name. The replacement is positional, so the
# vector lists all 17 columns in the order they appear in the file.
popdata <- setNames(
  popdata,
  c(
    "Rank", "CCA3", "Country", "Capital", "Continent",
    "Population_2022", "Population_2020", "Population_2015",
    "Population_2010", "Population_2000", "Population_1990",
    "Population_1980", "Population_1970",
    "Area", "Density", "Growth_Rate", "World_population_perc"
  )
)

# Preview the first rows under the new names.
head(popdata)
## # A tibble: 6 × 17
##    Rank CCA3  Country    Capital Conti…¹ Popul…² Popul…³ Popul…⁴ Popul…⁵ Popul…⁶
##   <dbl> <chr> <chr>      <chr>   <chr>     <dbl>   <dbl>   <dbl>   <dbl>   <dbl>
## 1    36 AFG   Afghanist… Kabul   Asia     4.11e7  3.90e7  3.38e7  2.82e7  1.95e7
## 2   138 ALB   Albania    Tirana  Europe   2.84e6  2.87e6  2.88e6  2.91e6  3.18e6
## 3    34 DZA   Algeria    Algiers Africa   4.49e7  4.35e7  3.95e7  3.59e7  3.08e7
## 4   213 ASM   American … Pago P… Oceania  4.43e4  4.62e4  5.14e4  5.48e4  5.82e4
## 5   203 AND   Andorra    Andorr… Europe   7.98e4  7.77e4  7.17e4  7.15e4  6.61e4
## 6    42 AGO   Angola     Luanda  Africa   3.56e7  3.34e7  2.81e7  2.34e7  1.64e7
## # … with 7 more variables: Population_1990 <dbl>, Population_1980 <dbl>,
## #   Population_1970 <dbl>, Area <dbl>, Density <dbl>, Growth_Rate <dbl>,
## #   World_population_perc <dbl>, and abbreviated variable names ¹​Continent,
## #   ²​Population_2022, ³​Population_2020, ⁴​Population_2015, ⁵​Population_2010,
## #   ⁶​Population_2000

Total Population from 1970-2022:

# World total for each recorded year: sum that year's column over all rows.
pop2022 <- sum(popdata$Population_2022)
pop2020 <- sum(popdata$Population_2020)
pop2015 <- sum(popdata$Population_2015)
pop2010 <- sum(popdata$Population_2010)
pop2000 <- sum(popdata$Population_2000)
pop1990 <- sum(popdata$Population_1990)
pop1980 <- sum(popdata$Population_1980)
pop1970 <- sum(popdata$Population_1970)

# Year labels and their totals, kept in the same (newest-first) order.
year <- c("2022", "2020", "2015", "2010", "2000", "1990", "1980", "1970")
yearpop <- c(
  pop2022, pop2020, pop2015, pop2010, pop2000, pop1990, pop1980, pop1970
)

totalpop <- data.frame(year, yearpop)

# The years are stored as labels, so they are converted to numbers for the
# x axis. On a discrete axis the unevenly spaced years (10-, 5- and 2-year
# gaps) would be drawn equidistant and distort the slope of the line.
ggplot(totalpop, aes(x = as.integer(year), y = yearpop)) +
  # Colour is a fixed setting, not a mapping, so no legend is produced.
  geom_line(colour = "blue") +
  geom_point(colour = "blue") +
  labs(
    x = "Year",
    y = "Total Population",
    title = "Total Population from 1970 - 2022"
  ) +
  # Label exactly the observed years; skip a label if it would overlap.
  scale_x_continuous(
    breaks = as.integer(year),
    guide = guide_axis(check.overlap = TRUE)
  ) +
  # Plain digits with thousands separators instead of scientific notation.
  scale_y_continuous(
    labels = function(x) {
      format(x, trim = TRUE, big.mark = ",", scientific = FALSE)
    }
  ) +
  theme_bw() +
  theme(plot.title = element_text(hjust = 0.5))

Let’s look at how the population changed between 1970 and 2022:

## Warning in melt(df): The melt generic in data.table has been passed a tbl_df
## and will attempt to redirect to the relevant reshape2 method; please note that
## reshape2 is deprecated, and this redirection is now deprecated as well. To
## continue using melt methods from reshape2 while both libraries are attached,
## e.g. melt.list, you can prepend the namespace like reshape2::melt(df). In the
## next version, this warning will become an error.
## No id variables; using all as measure variables
## Warning: Removed 349 rows containing non-finite values (`stat_boxplot()`).

From the boxplots we can clearly see that the world population increased over the years. Outliers were removed from the plot to make it easier to read.

Plots and Analysis

We can now have a look at the top countries in several categories below.

Most Populous Country

Least Populous Country

Most Densely Populated Country

Least Densely Populated Country

Largest Country by Area

Smallest Country by Area

Growth

Growth in %

Growth Across the World

Now let's see the growth rate of countries all over the world:

# World polygons from map_data(); vertices east of 180 degrees longitude
# are dropped.
# NOTE(review): presumably to avoid wrap-around artefacts - confirm.
world <- map_data("world") %>%
  filter(long <= 180)

# Rename the countries whose labels differ between the two tables so that
# they match the map's `region` names in the merge below.
popdata$Country <- recode(
  popdata$Country,
  "United States" = "USA",
  "United Kingdom" = "UK",
  "DR Congo" = "Democratic Republic of the Congo",
  "Republic of the Congo" = "Republic of Congo"
)

world %>%
  # Keep every map polygon even when no population row matches it.
  merge(popdata, by.x = "region", by.y = "Country", all.x = TRUE) %>%
  # merge() reorders rows; restore the drawing order of each polygon.
  arrange(group, order) %>%
  ggplot(aes(x = long, y = lat, group = group, fill = Growth_Rate)) +
  geom_polygon() +
  coord_map("moll") +
  theme_map()

Population Percentage by Continent:

# Share of the world population per continent: add up the per-country
# percentages within each continent.
continentpop <- summarise(
  group_by(popdata, Continent),
  WorldPopPercent = sum(World_population_perc)
)

# Pie chart: a single stacked bar wrapped onto polar coordinates.
ggplot(continentpop, aes(x = "", y = WorldPopPercent, fill = Continent)) +
  geom_col(width = 1, colour = "white") +
  coord_polar(theta = "y", start = 0) +
  scale_fill_brewer(palette = "Paired") +
  labs(title = "Portion of Population per Continent") +
  theme_void() +
  theme(plot.title = element_text(hjust = 0.5))

Animation of Global Population (1970-2022)

## Warning in melt(., id.vars = "Continent"): The melt generic in data.table has
## been passed a tbl_df and will attempt to redirect to the relevant reshape2
## method; please note that reshape2 is deprecated, and this redirection is now
## deprecated as well. To continue using melt methods from reshape2 while both
## libraries are attached, e.g. melt.list, you can prepend the namespace like
## reshape2::melt(.). In the next version, this warning will become an error.
## `summarise()` has grouped output by 'Continent'. You can override using the
## `.groups` argument.

1-Child Policy Population

# Keep only China and India, and only the country name plus the population
# columns from 2022 back to 1980. Columns are selected by name rather than
# by position so the selection survives any change in column order.
onechild <- popdata %>%
  filter(Country %in% c("China", "India")) %>%
  select(Country, Population_2022:Population_1980)

# Report the underlying storage type of the subset.
typeof(onechild)
## [1] "list"
# Reshape to long format: one row per country and year. `names_prefix`
# strips "Population_" so the Year column holds just the year label, which
# removes the need for a separate rename step.
onechild <- pivot_longer(
  onechild,
  cols = starts_with("Population_"),
  names_to = "Year",
  names_prefix = "Population_",
  values_to = "Population"
)

# Year is stored as a label, so it is converted to a number for the x axis.
# A discrete axis would space the 10-, 5- and 2-year gaps evenly and
# distort both curves.
ggplot(
  onechild,
  aes(x = as.integer(Year), y = Population, group = Country, colour = Country)
) +
  geom_line(linewidth = 1) +
  geom_point(size = 1.5) +
  # Label exactly the observed years; skip a label if it would overlap.
  scale_x_continuous(
    name = "Year",
    breaks = unique(as.integer(onechild$Year)),
    guide = guide_axis(check.overlap = TRUE)
  ) +
  # Plain digits with thousands separators instead of scientific notation.
  scale_y_continuous(
    labels = function(x) {
      format(x, trim = TRUE, big.mark = ",", scientific = FALSE)
    }
  ) +
  ggtitle(
    "Population of China and India from 1980 - 2022 during One-Child Policy"
  ) +
  theme_bw() +
  theme(plot.title = element_text(hjust = 0.5))