This dataset contains information about the population of countries around the world. The aim of this project is to dive into the dataset and look at Earth's population from different perspectives.
#Loading libraries
#install.packages("tidyverse") # for data cleaning
#install.packages("dplyr")
#install.packages("reshape2")
#install.packages("maptools") # for data visualization
#install.packages("ggthemes")
#install.packages("gganimate")
#install.packages("cluster") #for cluster analysis
#install.packages("Rtsne")
#install.packages("knitr")
#install.packages('data.table')
library(tidyverse)# for data cleaning
## Warning: package 'tidyverse' was built under R version 4.2.2
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.4.0 ✔ purrr 0.3.5
## ✔ tibble 3.1.8 ✔ dplyr 1.0.10
## ✔ tidyr 1.2.1 ✔ stringr 1.4.1
## ✔ readr 2.1.3 ✔ forcats 0.5.2
## Warning: package 'ggplot2' was built under R version 4.2.2
## Warning: package 'purrr' was built under R version 4.2.2
## Warning: package 'dplyr' was built under R version 4.2.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(dplyr)
library(reshape2)
## Warning: package 'reshape2' was built under R version 4.2.2
##
## Attaching package: 'reshape2'
##
## The following object is masked from 'package:tidyr':
##
## smiths
library(maptools) # for data visualization
## Warning: package 'maptools' was built under R version 4.2.2
## Loading required package: sp
## Warning: package 'sp' was built under R version 4.2.2
## Checking rgeos availability: FALSE
## Please note that 'maptools' will be retired during 2023,
## plan transition at your earliest convenience;
## some functionality will be moved to 'sp'.
## Note: when rgeos is not available, polygon geometry computations in maptools depend on gpclib,
## which has a restricted licence. It is disabled by default;
## to enable gpclib, type gpclibPermit()
library(ggthemes)
## Warning: package 'ggthemes' was built under R version 4.2.2
library(gganimate)
## Warning: package 'gganimate' was built under R version 4.2.2
library(cluster)#for cluster analysis
## Warning: package 'cluster' was built under R version 4.2.2
library(Rtsne)
## Warning: package 'Rtsne' was built under R version 4.2.2
library(knitr)
## Warning: package 'knitr' was built under R version 4.2.2
library(data.table)
## Warning: package 'data.table' was built under R version 4.2.2
##
## Attaching package: 'data.table'
##
## The following objects are masked from 'package:reshape2':
##
## dcast, melt
##
## The following objects are masked from 'package:dplyr':
##
## between, first, last
##
## The following object is masked from 'package:purrr':
##
## transpose
library(dplyr)
First, let's load the data and have a look at it:
# Read the world population table from the CSV file in the working directory.
popdata <- read_csv(file = "worldpop.csv")
## Rows: 234 Columns: 17
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): CCA3, Country, Capital, Continent
## dbl (13): Rank, 2022 Population, 2020 Population, 2015 Population, 2010 Popu...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Open the loaded table in the data viewer for a quick manual inspection.
# NOTE(review): presumably tibble::view(), attached via tidyverse — confirm.
view(popdata)
We can already see that some columns are not user friendly and should be renamed.
# Replace the raw CSV headers with syntactic, script-friendly names.
# The names are applied by position, one per column, in file order.
popdata <- setNames(
  popdata,
  c(
    "Rank", "CCA3", "Country", "Capital", "Continent",
    "Population_2022", "Population_2020", "Population_2015",
    "Population_2010", "Population_2000", "Population_1990",
    "Population_1980", "Population_1970",
    "Area", "Density", "Growth_Rate", "World_population_perc"
  )
)
# Preview the first rows under the new names.
head(popdata)
## # A tibble: 6 × 17
## Rank CCA3 Country Capital Conti…¹ Popul…² Popul…³ Popul…⁴ Popul…⁵ Popul…⁶
## <dbl> <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 36 AFG Afghanist… Kabul Asia 4.11e7 3.90e7 3.38e7 2.82e7 1.95e7
## 2 138 ALB Albania Tirana Europe 2.84e6 2.87e6 2.88e6 2.91e6 3.18e6
## 3 34 DZA Algeria Algiers Africa 4.49e7 4.35e7 3.95e7 3.59e7 3.08e7
## 4 213 ASM American … Pago P… Oceania 4.43e4 4.62e4 5.14e4 5.48e4 5.82e4
## 5 203 AND Andorra Andorr… Europe 7.98e4 7.77e4 7.17e4 7.15e4 6.61e4
## 6 42 AGO Angola Luanda Africa 3.56e7 3.34e7 2.81e7 2.34e7 1.64e7
## # … with 7 more variables: Population_1990 <dbl>, Population_1980 <dbl>,
## # Population_1970 <dbl>, Area <dbl>, Density <dbl>, Growth_Rate <dbl>,
## # World_population_perc <dbl>, and abbreviated variable names ¹Continent,
## # ²Population_2022, ³Population_2020, ⁴Population_2015, ⁵Population_2010,
## # ⁶Population_2000
Total Population from 1970-2022:
# World total for each observed year: sum of the per-country populations.
pop2022 <- sum(popdata$Population_2022)
pop2020 <- sum(popdata$Population_2020)
pop2015 <- sum(popdata$Population_2015)
pop2010 <- sum(popdata$Population_2010)
pop2000 <- sum(popdata$Population_2000)
pop1990 <- sum(popdata$Population_1990)
pop1980 <- sum(popdata$Population_1980)
pop1970 <- sum(popdata$Population_1970)

# One row per observed year; `year` is kept as character labels.
year <- c("2022", "2020", "2015", "2010", "2000", "1990", "1980", "1970")
yearpop <- c(
  pop2022, pop2020, pop2015, pop2010,
  pop2000, pop1990, pop1980, pop1970
)
totalpop <- data.frame(year, yearpop)

# Plot against a numeric year: the observations are 10, 5 and 2 years apart,
# and a discrete (character) axis would draw those gaps as equal steps and
# distort the slope of the line.
# The colour is a fixed setting on the geoms rather than an aesthetic mapping,
# so no dummy legend is created and none has to be hidden.
ggplot(totalpop, aes(x = as.integer(year), y = yearpop)) +
  geom_line(colour = "blue") +
  geom_point(colour = "blue") +
  labs(
    x = "Year",
    y = "Total Population",
    title = "Total Population from 1970 - 2022"
  ) +
  # Full numbers with thousands separators instead of scientific notation;
  # trim = TRUE drops the padding format() would otherwise add.
  scale_y_continuous(
    labels = function(x) {
      format(x, trim = TRUE, big.mark = ",", scientific = FALSE)
    }
  ) +
  theme_bw() +
  theme(plot.title = element_text(hjust = 0.5))
Let’s look at how the population changed between 1970 and 2022:
## Warning in melt(df): The melt generic in data.table has been passed a tbl_df
## and will attempt to redirect to the relevant reshape2 method; please note that
## reshape2 is deprecated, and this redirection is now deprecated as well. To
## continue using melt methods from reshape2 while both libraries are attached,
## e.g. melt.list, you can prepend the namespace like reshape2::melt(df). In the
## next version, this warning will become an error.
## No id variables; using all as measure variables
## Warning: Removed 349 rows containing non-finite values (`stat_boxplot()`).
From the boxplots we can clearly see that the world population increased over the years. Outliers were removed from the plot to make it easier to read.
We can now have a look at the top countries in several categories below.
Now let's see the growth rate of countries all over the world:
# Country outlines from the "world" map database, keeping only vertices with
# longitude up to 180.
# NOTE(review): presumably this drops points past the antimeridian that would
# smear across the projected map — confirm.
world <- map_data("world") %>%
  filter(long <= 180)

# Align the dataset's country names with the region names used by the map so
# that the join below finds them.
popdata$Country <- recode(
  popdata$Country,
  "United States" = "USA",
  "United Kingdom" = "UK",
  "DR Congo" = "Democratic Republic of the Congo",
  "Republic of the Congo" = "Republic of Congo"
)

# Choropleth of growth rate. A left join keeps every map vertex (regions with
# no match get NA fill) and, unlike merge(), preserves the row order of
# `world`; the arrange() is kept as an explicit guarantee that polygon
# vertices are drawn in sequence.
world %>%
  left_join(popdata, by = c("region" = "Country")) %>%
  arrange(group, order) %>%
  ggplot(aes(x = long, y = lat, group = group, fill = Growth_Rate)) +
  geom_polygon() +
  coord_map("moll") +
  theme_map()
# Share of the world population per continent: add up the country-level
# percentages within each continent.
continentpop <- summarise(
  group_by(popdata, Continent),
  WorldPopPercent = sum(World_population_perc)
)

# Pie chart: a single stacked bar wrapped onto polar coordinates.
ggplot(continentpop, aes(x = "", y = WorldPopPercent, fill = Continent)) +
  geom_col(width = 1, colour = "white") +
  coord_polar(theta = "y", start = 0) +
  scale_fill_brewer(palette = "Paired") +
  labs(title = "Portion of Population per Continent") +
  theme_void() +
  theme(plot.title = element_text(hjust = 0.5))
## Warning in melt(., id.vars = "Continent"): The melt generic in data.table has
## been passed a tbl_df and will attempt to redirect to the relevant reshape2
## method; please note that reshape2 is deprecated, and this redirection is now
## deprecated as well. To continue using melt methods from reshape2 while both
## libraries are attached, e.g. melt.list, you can prepend the namespace like
## reshape2::melt(.). In the next version, this warning will become an error.
## `summarise()` has grouped output by 'Continent'. You can override using the
## `.groups` argument.
# Population history of China and India only.
# Columns are chosen by name (Country plus the 2022-1980 population columns)
# rather than by negative position, so the subset stays correct if the column
# order of `popdata` ever changes.
onechild <- popdata %>%
  filter(Country %in% c("China", "India")) %>%
  select(Country, Population_2022:Population_1980)
typeof(onechild)
## [1] "list"
# Reshape to long form: one row per country and year.
# names_prefix strips "Population_" so Year holds "2022", "2020", ..., which
# removes the need for a separate by-reference rename of the columns.
onechild <- pivot_longer(
  onechild,
  cols = starts_with("Population_"),
  names_prefix = "Population_",
  names_to = "Year",
  values_to = "Population"
)

# Plot against a numeric year so the uneven gaps between observations
# (10, 5 and 2 years) are drawn to scale instead of as equal steps.
ggplot(
  onechild,
  aes(x = as.integer(Year), y = Population, group = Country, color = Country)
) +
  geom_line(linewidth = 1) +
  geom_point(size = 1.5) +
  # Full numbers with thousands separators instead of scientific notation.
  scale_y_continuous(
    labels = function(x) {
      format(x, trim = TRUE, big.mark = ",", scientific = FALSE)
    }
  ) +
  labs(x = "Year") +
  ggtitle("Population of China and India from 1980 - 2022 during One-Child Policy") +
  theme_bw() +
  theme(plot.title = element_text(hjust = 0.5))