The following data was downloaded from the Gapminder table "Population aged 20-39 years, both sexes (%)".

I would like to investigate how the share of the population in the 20-39 age group changes over time.

library(tidyr)
library(ggplot2)
library(ggthemes)
library(dplyr)
## 
## Attaching package: 'dplyr'
## 
## The following object is masked from 'package:stats':
## 
##     filter
## 
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Load the Gapminder table of the 20-39 age share (%): one row per country,
# one column per year. The first CSV column is used as row names, and
# check.names = FALSE keeps the year headers ("1950", "1955", ...) as they
# are instead of rewriting them into syntactic names such as "X1950".
# FALSE is spelled out because `F` is an ordinary variable that can be
# reassigned.
tmp <- read.csv("clean.csv", row.names = 1, check.names = FALSE)
head(tmp)
##       Country     1950     1955     1960     1965     1970     1975
## 1 Afghanistan 28.29795 28.43349 28.48970 28.48662 28.55816 28.04832
## 2     Albania 26.60626 26.88406 27.56052 27.60835 26.94107 27.48855
## 3     Algeria 27.86155 27.70229 26.90064 25.84081 23.27611 23.74204
## 4      Angola 28.94356 28.42640 27.67019 26.77080 25.87539 25.85144
## 5   Argentina 32.70363 31.70320 30.47766 29.35833 28.73688 28.71751
## 6     Armenia 26.77515 28.53487 32.54818 29.75057 26.84670 28.01556
##       1980     1985     1990     1995     2000     2005     2010     2015
## 1 27.59733 27.21061 26.79068 26.74187 26.70789 26.96184 27.52782 28.25814
## 2 29.36775 31.27113 33.13070 31.82253 28.61799 28.40142 29.66235 31.48205
## 3 24.66773 26.70498 28.76355 30.51015 33.32022 35.83952 36.88980 36.35433
## 4 25.98676 26.25375 26.54975 26.43376 26.79647 27.17535 28.09078 29.13785
## 5 28.49329 28.35781 28.00960 28.05993 29.26176 29.99923 30.65952 30.85294
## 6 29.59974 33.37328 33.80361 30.63315 28.30679 27.80137 30.54028 32.13376
##       2020     2025     2030     2035     2040     2045     2050
## 1 29.05972 29.72047 30.16447 30.76133 31.50975 32.32804 33.06464
## 2 32.22522 29.84987 26.92420 24.76526 23.65464 24.14925 23.85710
## 3 33.45475 30.37476 28.45485 27.63377 28.13748 27.92545 26.76127
## 4 29.83065 30.76643 31.24774 31.83710 32.51237 32.74464 32.84910
## 5 29.91603 29.13497 28.47015 27.78811 27.14543 26.59227 25.78619
## 6 30.54156 28.08671 25.33123 24.38869 25.36069 25.17051 24.03049
# Reshape from wide (one column per year) to long: one row per country-year
# pair, with the former column name in `year` and the percentage in `n`.
# Every column except `Country` is gathered, so the call does not depend on
# the positions of the year columns.
# `year` holds the old column names, so it is text rather than a number.
# gather() stacks the year columns one after another, which gives the
# year-by-year row order that head() shows.
df <- gather(tmp, key = "year", value = "n", -Country)
names(df)
## [1] "Country" "year"    "n"
head(df)
##       Country year        n
## 1 Afghanistan 1950 28.29795
## 2     Albania 1950 26.60626
## 3     Algeria 1950 27.86155
## 4      Angola 1950 28.94356
## 5   Argentina 1950 32.70363
## 6     Armenia 1950 26.77515
# Plot the 20-39 age share for China over time.
china <- subset(df, Country == "China")
# `year` is stored as text after reshaping. Convert it to integers in a
# plotting copy so the x axis is a real numeric scale; going through
# as.character() keeps the conversion correct for factor input as well.
china_by_year <- mutate(china, year = as.integer(as.character(year)))
ggplot(china_by_year, aes(x = year, y = n)) +
  geom_line() +
  # Breaks outside the range of the data are simply not drawn.
  scale_x_continuous(breaks = seq(1940, 2060, 5)) +
  xlab("Year") +
  ylab("Percentage of 20-39 population in China")

The graph above shows how the share of 20-39 year olds in China's population (in percent) changes over time. We can see that the share has been dropping since 2000.

The following graph shows the same data for 196 countries.

# Draw one line per country, coloured by country, for every country in `df`.
# The aesthetics are declared once on the plot and inherited by the line layer.
ggplot(
  data = df,
  mapping = aes(x = year, y = n, group = Country, colour = Country)
) +
  geom_line()

The graph is too chaotic, so I will focus on the following countries instead:

- United States
- Japan
- Canada
- China
- Hong Kong

# Restrict the long table to the five countries/regions of interest.
focus_countries <- c(
  "United States", "Japan", "Canada", "China", "Hong Kong, China"
)
sub_df <- df %>%
  filter(Country %in% focus_countries)

# One coloured line per country; the x-axis labels are tilted 45 degrees.
ggplot(
  data = sub_df,
  mapping = aes(x = year, y = n, group = Country, colour = Country)
) +
  geom_line() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

As we can see from the graph, the share of people aged 20-39 is dropping for these countries and regions. This may be caused by a decreasing trend in total population. To verify that, I downloaded and imported the total population data:

# Load total population by country, one column per year.
# check.names = FALSE keeps the year headers ("1950", "1951", ...) unchanged;
# FALSE is spelled out because `F` is an ordinary variable that can be
# reassigned.
dfpop <- read.csv("cleaned total population.csv", check.names = FALSE)
# Drop the first column of the file, which is not used in this analysis;
# `country` is then the first column.
dfpop[1] <- NULL
str(dfpop)
## 'data.frame':    241 obs. of  65 variables:
##  $ country: Factor w/ 241 levels "Afghanistan",..: 1 2 3 4 5 6 7 8 9 10 ...
##  $ 1950   : num  8151455 1215002 8752997 18937 6197 ...
##  $ 1951   : num  8276820 1239971 8953373 19287 6692 ...
##  $ 1952   : num  8407148 1269302 9140990 19525 7247 ...
##  $ 1953   : num  8542906 1302536 9326099 19659 7856 ...
##  $ 1954   : num  8684494 1339240 9515990 19706 8514 ...
##  $ 1955   : num  8832253 1378998 9715002 19688 9217 ...
##  $ 1956   : num  8986449 1421417 9924347 19640 9964 ...
##  $ 1957   : num  9147286 1466124 10142284 19603 10754 ...
##  $ 1958   : num  9314915 1512765 10364614 19624 11585 ...
##  $ 1959   : num  9489453 1561012 10585506 19757 12460 ...
##  $ 1960   : num  9671046 1610565 10799997 20041 13377 ...
##  $ 1961   : num  9859928 1661158 11006643 20500 14337 ...
##  $ 1962   : num  10056480 1712563 11209845 21124 15337 ...
##  $ 1963   : num  10261254 1764593 11420845 21870 16373 ...
##  $ 1964   : num  10474903 1817098 11654905 22672 17438 ...
##  $ 1965   : num  10697983 1869942 11923002 23480 18529 ...
##  $ 1966   : num  10927724 1922993 12229853 24283 19640 ...
##  $ 1967   : num  11163656 1976140 12572629 25087 20772 ...
##  $ 1968   : num  11411022 2029314 12945462 25869 21931 ...
##  $ 1969   : num  11676990 2082474 13338918 26608 23127 ...
##  $ 1970   : num  11964906 2135599 13746185 27288 24364 ...
##  $ 1971   : num  12273101 2188650 14165889 27907 25656 ...
##  $ 1972   : num  12593688 2241623 14600659 28470 26997 ...
##  $ 1973   : num  12915499 2294578 15052371 28983 28357 ...
##  $ 1974   : num  13223928 2347607 15524137 29453 29688 ...
##  $ 1975   : num  13505544 2400801 16018195 29897 30967 ...
##  $ 1976   : num  13766792 2454255 16533323 30305 32156 ...
##  $ 1977   : num  14003408 2508026 17068212 30696 33279 ...
##  $ 1978   : num  14179656 2562121 17624756 31139 34432 ...
##  $ 1979   : num  14249493 2616530 18205468 31727 35753 ...
##  $ 1980   : num  14185729 2671300 18811199 32526 37328 ...
##  $ 1981   : num  13984092 2725029 19442423 33557 39226 ...
##  $ 1982   : num  13672870 2777592 20095648 34797 41390 ...
##  $ 1983   : num  13300056 2831682 20762767 36203 43636 ...
##  $ 1984   : num  12931791 2891004 21433070 37706 45702 ...
##  $ 1985   : num  12625292 2957390 22098298 39253 47414 ...
##  $ 1986   : num  12372113 3033393 22753511 40834 48653 ...
##  $ 1987   : num  12183387 3116009 23398470 42446 49504 ...
##  $ 1988   : num  12156685 3194854 24035237 44048 50236 ...
##  $ 1989   : num  12414686 3255859 24668100 45595 51241 ...
##  $ 1990   : num  13032161 3289483 25299182 47052 52773 ...
##  $ 1991   : num  14069854 3291695 25930560 48402 54996 ...
##  $ 1992   : num  15472076 3266983 26557969 49648 57767 ...
##  $ 1993   : num  17053213 3224901 27169903 50801 60670 ...
##  $ 1994   : num  18553819 3179442 27751086 51885 63111 ...
##  $ 1995   : num  19789880 3141102 28291591 52919 64699 ...
##  $ 1996   : num  20684982 3112597 28786855 53901 65227 ...
##  $ 1997   : num  21299350 3091902 29242917 54834 64905 ...
##  $ 1998   : num  21752257 3079037 29673694 55745 64246 ...
##  $ 1999   : num  22227543 3072725 30099010 56667 63985 ...
##  $ 2000   : num  22856302 3071856 30533827 57625 64634 ...
##  $ 2001   : num  23677385 3077378 30982214 58633 66390 ...
##  $ 2002   : num  24639841 3089778 31441848 59687 69043 ...
##  $ 2003   : num  25678639 3106701 31913462 60774 72203 ...
##  $ 2004   : num  26693486 3124861 32396048 61871 75292 ...
##  $ 2005   : num  27614718 3141800 32888449 62962 77888 ...
##  $ 2006   : num  28420974 3156607 33391954 64045 79874 ...
##  $ 2007   : num  29145841 3169665 33906605 65130 81390 ...
##  $ 2008   : num  29839994 3181397 34428028 66217 82577 ...
##  $ 2009   : num  30577756 3192723 34950168 67312 83677 ...
##  $ 2010   : num  31411743 3204284 35468208 68420 84864 ...
##  $ 2011   : num  32358260 3215988 35980193 69543 86165 ...
##  $ 2012   : num  33397058 3227373 36485828 70680 87518 ...
##  $ 2013   : num  34499915 3238316 36983924 71834 88909 ...
# Reshape total population from wide to long: one row per country-year pair,
# with the former column name in `year` and the head count in `count`.
# Every column except `country` is gathered, so the call does not depend on
# how many year columns the file has or where they sit.
dfpop <- gather(dfpop, key = "year", value = "count", -country)
head(dfpop)
##          country year   count
## 1    Afghanistan 1950 8151455
## 2        Albania 1950 1215002
## 3        Algeria 1950 8752997
## 4 American Samoa 1950   18937
## 5        Andorra 1950    6197
## 6         Angola 1950 4147509

Now, I draw the total population of the aforementioned countries:

# Keep only the total-population rows for the five countries/regions that
# were examined in the age-share plots.
selected_names <- c(
  "United States", "Japan", "Canada", "China", "Hong Kong, China"
)
selected <- filter(dfpop, country %in% selected_names)
head(selected)
##            country year     count
## 1           Canada 1950  13736997
## 2            China 1950 550771433
## 3 Hong Kong, China 1950   1973998
## 4            Japan 1950  82199470
## 5    United States 1950 157813040
## 6           Canada 1951  14132558
# Plot total population over time, one coloured line per country.
# `year` is stored as text after reshaping, which would put one tick label on
# the axis for every year. Convert it to integers in a plotting copy so the
# x axis is numeric; going through as.character() keeps the conversion
# correct for factor input as well.
selected_by_year <- mutate(selected, year = as.integer(as.character(year)))
ggplot(selected_by_year, aes(x = year, y = count)) +
  geom_line(aes(group = country, color = country)) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

All of these countries and regions have growing populations. This is not what I expected; I thought the total population might drop as well. Maybe life expectancy is longer now, so more of the population has moved into older age groups. Note that the total population data only runs to 2013, whereas the age-share data extends to 2050.