library(mlr)
## Loading required package: ParamHelpers
## Warning: replacing previous import 'BBmisc::isFALSE' by
## 'backports::isFALSE' when loading 'mlr'
library(tidyr)
library(Hmisc)
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
## Loading required package: ggplot2
##
## Attaching package: 'Hmisc'
## The following object is masked from 'package:mlr':
##
## impute
## The following objects are masked from 'package:base':
##
## format.pval, units
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:Hmisc':
##
## src, summarize
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(gdata)
## gdata: read.xls support for 'XLS' (Excel 97-2004) files ENABLED.
##
## gdata: read.xls support for 'XLSX' (Excel 2007+) files ENABLED.
##
## Attaching package: 'gdata'
## The following objects are masked from 'package:dplyr':
##
## combine, first, last
## The following object is masked from 'package:mlr':
##
## resample
## The following object is masked from 'package:stats':
##
## nobs
## The following object is masked from 'package:utils':
##
## object.size
## The following object is masked from 'package:base':
##
## startsWith
library(editrules)
## Loading required package: igraph
##
## Attaching package: 'igraph'
## The following objects are masked from 'package:dplyr':
##
## as_data_frame, groups, union
## The following object is masked from 'package:tidyr':
##
## crossing
## The following objects are masked from 'package:stats':
##
## decompose, spectrum
## The following object is masked from 'package:base':
##
## union
##
## Attaching package: 'editrules'
## The following objects are masked from 'package:igraph':
##
## blocks, normalize
## The following object is masked from 'package:dplyr':
##
## contains
## The following object is masked from 'package:tidyr':
##
## separate
## The following object is masked from 'package:ParamHelpers':
##
## isFeasible
library(stringr)
library(MVN)
## sROC 0.1-2 loaded
library(forecast)
library(lubridate)
##
## Attaching package: 'lubridate'
## The following object is masked from 'package:igraph':
##
## %--%
## The following object is masked from 'package:base':
##
## date
Now, the data processing process has been finished.
From the merged dataset, a dataset “north_america” which contains population information of North America countries—United States and Canada, has been obtained through merging and filtering 3 datasets containing the world male, female population and population’s regional information repectively.
The north_america dataset contains 10 variables with 2 character variables “Country.Code” and “Indicator.Code”, 4 factor variables “Country.Name”, “Indicator.Name”, “Region” and “IncomeGroup”, 3 numeric variables, “male_female_population”, “Population.percentage” and “transformation” and 1 date type variable, “year”. All the 10 variables has been converted into proper date types. The factor variable “IncomeGroup” has been order according to its levels , “Low income”, “Lower middle income”, “Upper middle income” and “High income”. And the merged datasets has been converted from untidy format into tidy format. All variables missing values and inconsistencies has been scaned and proporly dealed with.
The outliers of the numeric variable “male_female_population” which contains the size of population for male and female of each country has been scaned and coped by optimal boxcox transformation.
And the transformed variable from the “male_female_population” variable has been created by naming “transformed.population”. The transformed variable can also decrease the skewness to make the variable more normally distributed, which be used for further population analysis and forcast.
This analysis has retrieved three datasets of the total male and female population across different countries around the world from 1960 to 2016, and the regional information on the areas of these countries.
The datasets for total male and female population accross different countries containing the following variables. The “Country Name” and “Country Code” record the name of different countries and their abbreviations respectively. The “Indicator Name” and “Indicator Code” record the gender of the observations of the population data and their abreviation code respectively. The population size of each country from year 1960 to 2016 has been recorded in individual columns with each column occupied by each year.
The dataset called “Metadata_Country_API_SP.POP.TOTL.FE.IN_DS2_en_csv_v2_9952571” containing the information on regional and income levels also has been used. There are five variables in there, the “Country Code” record the name of different countries , the “Region” denoting which continents the countries belongs to, the “IncomeGroup” conveying the income level of differenting countries, the “SpecialNotes” recording different specific notes and the “TableName” containing different countries name.
All the three datasets are open sourced and can be find at the following websies: https://data.worldbank.org/indicator/SP.POP.TOTL.MA.IN https://data.worldbank.org/indicator/SP.POP.TOTL.FE.IN
A clear description of data sets, their sources, and variable descriptions should be provided. In this section, you must also provide the R codes with outputs (head of data sets) that you used to import/read/scrape the data set. You need to fulfil the minimum requirement #1 and merge at least two data sets to create the one you are going to work on. In addition to the R codes and outputs, you need to explain the steps that you have taken.
#Read the female population dataset.
fe<- read.csv("~/Desktop/population_female.csv", header =TRUE,skip=4)
#Read the male population dataset.
ma<- read.csv("~/Desktop/population_male.csv", header =TRUE,skip=4)
#Read the regional and income level information contained dataset.
region<- read.csv("~/Desktop/pupulation region.csv",header = TRUE)
#Before conducting the gathering method to the dataset, the Column, X2017, in both "ma" and "fe" datasets, has to be delete due to no information contained in there.
fe <- select(fe, -X2017)
ma <- select(ma, -X2017)
head(fe)
## Country.Name Country.Code Indicator.Name Indicator.Code X1960
## 1 Aruba ABW Population, female SP.POP.TOTL.FE.IN 27637
## 2 Afghanistan AFG Population, female SP.POP.TOTL.FE.IN 4346990
## 3 Angola AGO Population, female SP.POP.TOTL.FE.IN 2900597
## 4 Albania ALB Population, female SP.POP.TOTL.FE.IN 780595
## 5 Andorra AND Population, female SP.POP.TOTL.FE.IN NA
## 6 Arab World ARB Population, female SP.POP.TOTL.FE.IN 45909546
## X1961 X1962 X1963 X1964 X1965 X1966 X1967 X1968
## 1 28254 28655 28907 29094 29268 29458 29631 29804
## 2 4437679 4532368 4631212 4734371 4842015 4952722 5066254 5185164
## 3 2956543 3014082 3072260 3129657 3185518 3239546 3292896 3348020
## 4 805305 830317 855238 880256 904823 928927 953600 981013
## 5 NA NA NA NA NA NA NA NA
## 6 47174491 48482890 49837622 51242480 52699695 54215933 55790717 57411074
## X1969 X1970 X1971 X1972 X1973 X1974 X1975 X1976
## 1 29988 30178 30401 30634 30889 31065 31150 31111
## 2 5313009 5451441 5599707 5753953 5908686 6056668 6191328 6315722
## 3 3408200 3475917 3552328 3637187 3729918 3829313 3934592 4045039
## 4 1009808 1035954 1061521 1088570 1114865 1141014 1167738 1193877
## 5 NA NA NA NA NA NA NA NA
## 6 59058941 60724221 62398727 64094688 65847194 67704063 69698805 71842855
## X1977 X1978 X1979 X1980 X1981 X1982 X1983 X1984
## 1 30989 30834 30753 30805 31047 31457 31912 32227
## 2 6427910 6510970 6543069 6511413 6411284 6255675 6072080 5898422
## 3 4161240 4285197 4419637 4566089 4726261 4898495 5076921 5253511
## 4 1220603 1246195 1271298 1297791 1324521 1353597 1383430 1413336
## 5 NA NA NA NA NA NA NA NA
## 6 74123776 76521599 79005258 81550074 84149697 86805054 89506982 92246331
## X1985 X1986 X1987 X1988 X1989 X1990 X1991
## 1 32303 32055 31571 31104 30993 31491 32693
## 2 5764972 5674753 5627943 5647569 5760223 5981147 6327356
## 3 5422852 5581981 5733487 5884261 6044247 6220388 6416266
## 4 1442682 1469816 1497724 1525329 1568794 1603298 1604803
## 5 NA NA NA NA NA NA NA
## 6 95015336 97806638 100615528 103440246 106280987 110113892 113029317
## X1992 X1993 X1994 X1995 X1996 X1997 X1998
## 1 34482 36633 38785 40708 42294 43615 44757
## 2 6785762 7304033 7809329 8250185 8603315 8885666 9134961
## 3 6629273 6852909 7077584 7296962 7508648 7716678 7928906
## 4 1610619 1617772 1621629 1618647 1607089 1588481 1566199
## 5 NA NA NA NA NA NA NA
## 6 115137501 118134381 121111744 124723010 127619185 130509484 133371678
## X1999 X2000 X2001 X2002 X2003 X2004 X2005
## 1 45856 47012 48252 49506 50707 51712 52456
## 2 9408613 9746770 10162500 10637846 11144964 11642547 12101551
## 3 8156453 8407379 8684468 8985544 9307609 9645607 9995796
## 4 1544994 1528046 1511707 1509000 1507666 1505935 1501423
## 5 NA NA NA NA NA NA NA
## 6 136236749 139113662 141998877 144904104 147871961 150957572 154198970
## X2006 X2007 X2008 X2009 X2010 X2011 X2012
## 1 52896 53083 53104 53112 53202 53404 53701
## 2 12511524 12884170 13239157 13606016 14005473 14444001 14912657
## 3 10357453 10731777 11119158 11520427 11936016 12366067 12809782
## 4 1493529 1482744 1470945 1460006 1451422 1445814 1441287
## 5 NA NA NA NA NA NA NA
## 6 157610509 161177796 164879183 168667331 172508052 176400834 180343747
## X2013 X2014 X2015 X2016
## 1 54060 54417 54743 55023
## 2 15398276 15881092 16346869 16791609
## 3 13265577 13731363 14205741 14688058
## 4 1436444 1431651 1426369 1423809
## 5 NA NA NA NA
## 6 184306583 188253495 192160815 196007048
head(ma)
## Country.Name Country.Code Indicator.Name Indicator.Code X1960
## 1 Aruba ABW Population, male SP.POP.TOTL.MA.IN 26574
## 2 Afghanistan AFG Population, male SP.POP.TOTL.MA.IN 4649361
## 3 Angola AGO Population, male SP.POP.TOTL.MA.IN 2742585
## 4 Albania ALB Population, male SP.POP.TOTL.MA.IN 828205
## 5 Andorra AND Population, male SP.POP.TOTL.MA.IN NA
## 6 Arab World ARB Population, male SP.POP.TOTL.MA.IN 46581386
## X1961 X1962 X1963 X1964 X1965 X1966 X1967 X1968
## 1 27184 27570 27788 27938 28092 28257 28424 28582
## 2 4729085 4813500 4902742 4996990 5096399 5199609 5306376 5419182
## 3 2796481 2851979 2908157 2963664 3017781 3070224 3122099 3175771
## 4 854495 881002 907383 933879 959968 985646 1011998 1041259
## 5 NA NA NA NA NA NA NA NA
## 6 47870006 49199404 50573454 51997422 53475293 55014660 56616215 58269091
## X1969 X1970 X1971 X1972 X1973 X1974 X1975 X1976
## 1 28738 28885 29039 29206 29354 29463 29507 29475
## 2 5541419 5674682 5818118 5967987 6119136 6264873 6398958 6524577
## 3 3234432 3300464 3374941 3457647 3548042 3645025 3747887 3855958
## 4 1071887 1099525 1126332 1154556 1181887 1209110 1237093 1264649
## 5 NA NA NA NA NA NA NA NA
## 6 59957601 61674153 63408692 65174687 67016222 68992698 71144493 73489523
## X1977 X1978 X1979 X1980 X1981 X1982 X1983 X1984
## 1 29377 29269 29227 29291 29520 29888 30289 30609
## 2 6639628 6726764 6763626 6736957 6642670 6493970 6317189 6148693
## 3 3969748 4090950 4221884 4363811 4518246 4683661 4854641 5023810
## 4 1292943 1320071 1346534 1374206 1401535 1430681 1460530 1491093
## 5 NA NA NA NA NA NA NA NA
## 6 76009278 78662125 81387230 84139416 86902253 89685030 92498845 95364425
## X1985 X1986 X1987 X1988 X1989 X1990 X1991
## 1 30723 30589 30262 29975 30039 30658 31929
## 2 6018078 5926288 5874818 5893319 6017386 6267967 6666301
## 3 5186190 5339056 5484781 5629707 5782990 5951053 6137180
## 4 1522080 1552819 1585881 1617007 1659149 1683244 1661987
## 5 NA NA NA NA NA NA NA
## 6 98294965 101287129 104327021 107404525 110506415 114621554 117800551
## X1992 X1993 X1994 X1995 X1996 X1997 X1998
## 1 33753 35871 37915 39616 40906 41836 42520
## 2 7195469 7791066 8363390 8849356 9219569 9495939 9729038
## 3 6339072 6550825 6763717 6972032 7173636 7372303 7575412
## 4 1636420 1609515 1585907 1569137 1560944 1559800 1562331
## 5 NA NA NA NA NA NA NA
## 6 119899678 123151710 126324186 130306661 133224277 136065591 138863468
## X1999 X2000 X2001 X2002 X2003 X2004 X2005
## 1 43149 43841 44646 45486 46310 47025 47575
## 2 9995063 10346986 10803963 11342077 11919887 12476432 12969247
## 3 7793313 8033545 8298798 8587105 8895760 9220109 9556746
## 4 1563784 1560981 1548466 1542010 1531950 1521004 1510064
## 5 NA NA NA NA NA NA NA
## 6 141726120 144718354 147851480 151122471 154562558 158204457 162065758
## X2006 X2007 X2008 X2009 X2010 X2011 X2012
## 1 47936 48137 48249 48341 48467 48649 48876
## 2 13381926 13732622 14054874 14398315 14797694 15264598 15784301
## 3 9904946 10265910 10640262 11029120 11433115 11852498 12286368
## 4 1499018 1487273 1476369 1467513 1461599 1459381 1459114
## 5 NA NA NA NA NA NA NA
## 6 166162755 170476001 174946300 179477763 184000856 188495044 192963246
## X2013 X2014 X2015 X2016
## 1 49127 49378 49598 49799
## 2 16333412 16876928 17389625 17864423
## 3 12732763 13189103 13653564 14125405
## 4 1458648 1457453 1454334 1452292
## 5 NA NA NA NA
## 6 197395503 201789533 206144145 210445642
head(region)
## Country.Code Region IncomeGroup
## 1 ABW Latin America & Caribbean High income
## 2 AFG South Asia Low income
## 3 AGO Sub-Saharan Africa Lower middle income
## 4 ALB Europe & Central Asia Upper middle income
## 5 AND Europe & Central Asia High income
## 6 ARB
## SpecialNotes
## 1 SNA data for 2000-2011 are updated from official government statistics; 1994-1999 from UN databases. Base year has changed from 1995 to 2000.
## 2 Fiscal year end: March 20; reporting period for national accounts data is calendar year, estimated to insure consistency between national accounts and fiscal data. National accounts data are sourced from the IMF and differ from the Central Statistics Organization numbers due to exclusion of the opium economy.
## 3
## 4
## 5 WB-3 code changed from ADO to AND to align with ISO code.
## 6 Arab World aggregate. Arab World is composed of members of the League of Arab States.
## TableName
## 1 Aruba
## 2 Afghanistan
## 3 Angola
## 4 Albania
## 5 Andorra
## 6 Arab World
#It can be seen that the two datasets are not tidy due to the fact that the year of population as a variable should have occupied just one culumn but actually occupied 41 column with each year occupying one column. Accordingly, the amount of population each year which should have occupying just one column has occupying 57 column as the year variable does. Therefore, it is necessary to fisrt change the non-tidy data into the tidy one.
#As the "ma" and "fe" datasets are in untidy format which has been discussed before, it is necessary to change the untidy into the tidy one.
#Get tidy format datasets, "matidy" and "fetidy".
fetidy <-gather(fe,year,female_population,X1960:X2016)
head(fetidy)
## Country.Name Country.Code Indicator.Name Indicator.Code year
## 1 Aruba ABW Population, female SP.POP.TOTL.FE.IN X1960
## 2 Afghanistan AFG Population, female SP.POP.TOTL.FE.IN X1960
## 3 Angola AGO Population, female SP.POP.TOTL.FE.IN X1960
## 4 Albania ALB Population, female SP.POP.TOTL.FE.IN X1960
## 5 Andorra AND Population, female SP.POP.TOTL.FE.IN X1960
## 6 Arab World ARB Population, female SP.POP.TOTL.FE.IN X1960
## female_population
## 1 27637
## 2 4346990
## 3 2900597
## 4 780595
## 5 NA
## 6 45909546
matidy <-gather(ma,year,male_population,X1960:X2016)
head(matidy)
## Country.Name Country.Code Indicator.Name Indicator.Code year
## 1 Aruba ABW Population, male SP.POP.TOTL.MA.IN X1960
## 2 Afghanistan AFG Population, male SP.POP.TOTL.MA.IN X1960
## 3 Angola AGO Population, male SP.POP.TOTL.MA.IN X1960
## 4 Albania ALB Population, male SP.POP.TOTL.MA.IN X1960
## 5 Andorra AND Population, male SP.POP.TOTL.MA.IN X1960
## 6 Arab World ARB Population, male SP.POP.TOTL.MA.IN X1960
## male_population
## 1 26574
## 2 4649361
## 3 2742585
## 4 828205
## 5 NA
## 6 46581386
#Change the variable name "male_population" and "female_population" in both "matidy" and "fetidy" datasets into the same variable name "male_female_population"
colnames(fetidy)[6] <- "male_female_population"
colnames(matidy)[6] <- "male_female_population"
#Merge the two datasets "matidy" and "fetidy" into dataset "mafamale".
mafemale<-bind_rows(fetidy,matidy)
## Warning in bind_rows_(x, .id): Unequal factor levels: coercing to character
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): Unequal factor levels: coercing to character
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
#Join another dataset "region" into "mafamale".
region<-select(region,Country.Code,Region,IncomeGroup)
head(region)
## Country.Code Region IncomeGroup
## 1 ABW Latin America & Caribbean High income
## 2 AFG South Asia Low income
## 3 AGO Sub-Saharan Africa Lower middle income
## 4 ALB Europe & Central Asia Upper middle income
## 5 AND Europe & Central Asia High income
## 6 ARB
#Left join the region dataset into the mafemale dataset.
population<-mafemale%>%left_join(region,by="Country.Code")
## Warning: Column `Country.Code` joining factors with different levels,
## coercing to character vector
head(population)
## Country.Name Country.Code Indicator.Name Indicator.Code year
## 1 Aruba ABW Population, female SP.POP.TOTL.FE.IN X1960
## 2 Afghanistan AFG Population, female SP.POP.TOTL.FE.IN X1960
## 3 Angola AGO Population, female SP.POP.TOTL.FE.IN X1960
## 4 Albania ALB Population, female SP.POP.TOTL.FE.IN X1960
## 5 Andorra AND Population, female SP.POP.TOTL.FE.IN X1960
## 6 Arab World ARB Population, female SP.POP.TOTL.FE.IN X1960
## male_female_population Region IncomeGroup
## 1 27637 Latin America & Caribbean High income
## 2 4346990 South Asia Low income
## 3 2900597 Sub-Saharan Africa Lower middle income
## 4 780595 Europe & Central Asia Upper middle income
## 5 NA Europe & Central Asia High income
## 6 45909546
#It can be check that the variable "region" groups the data into "East Asia & Pacific", "Europe & Central Asia","Latin America & Caribbean", "Middle East & North Africa" "North America", "South Asia", "Sub-Saharan Africa" and blank group without being classified into any of region of the world.
levels(population$Region)
## [1] "" "East Asia & Pacific"
## [3] "Europe & Central Asia" "Latin America & Caribbean"
## [5] "Middle East & North Africa" "North America"
## [7] "South Asia" "Sub-Saharan Africa"
# Inspect its structure, the data structure is shown below.
str(population)
## 'data.frame': 30096 obs. of 8 variables:
## $ Country.Name : Factor w/ 264 levels "Afghanistan",..: 11 1 6 2 5 8 250 9 10 4 ...
## $ Country.Code : chr "ABW" "AFG" "AGO" "ALB" ...
## $ Indicator.Name : chr "Population, female" "Population, female" "Population, female" "Population, female" ...
## $ Indicator.Code : chr "SP.POP.TOTL.FE.IN" "SP.POP.TOTL.FE.IN" "SP.POP.TOTL.FE.IN" "SP.POP.TOTL.FE.IN" ...
## $ year : chr "X1960" "X1960" "X1960" "X1960" ...
## $ male_female_population: num 27637 4346990 2900597 780595 NA ...
## $ Region : Factor w/ 8 levels "","East Asia & Pacific",..: 4 7 8 3 3 1 5 4 3 2 ...
## $ IncomeGroup : Factor w/ 5 levels "","High income",..: 2 3 4 5 2 1 2 5 4 5 ...
#The dataset can be filtered into different subsets according to different regions if the infomation of particular region needs to be investigated. In this analysis, the population information on North America will be analysed.
#Filter the dataset "population" and create the new subset only including the Northe America region, named "north_america".
north_america<-population%>%filter(Region=="North America")
head(north_america)
## Country.Name Country.Code Indicator.Name Indicator.Code year
## 1 Bermuda BMU Population, female SP.POP.TOTL.FE.IN X1960
## 2 Canada CAN Population, female SP.POP.TOTL.FE.IN X1960
## 3 United States USA Population, female SP.POP.TOTL.FE.IN X1960
## 4 Bermuda BMU Population, female SP.POP.TOTL.FE.IN X1961
## 5 Canada CAN Population, female SP.POP.TOTL.FE.IN X1961
## 6 United States USA Population, female SP.POP.TOTL.FE.IN X1961
## male_female_population Region IncomeGroup
## 1 NA North America High income
## 2 8851741 North America High income
## 3 91167688 North America High income
## 4 NA North America High income
## 5 9040305 North America High income
## 6 92724776 North America High income
Summarise the types of variables and data structures, check the attributes in the data.
#Check the class of each variable in ???population???
sapply(population, class)
## Country.Name Country.Code Indicator.Name
## "factor" "character" "character"
## Indicator.Code year male_female_population
## "character" "character" "numeric"
## Region IncomeGroup
## "factor" "factor"
#Check the levels of factor variable "IncomeGroup" and order the levels of "IncomeGroup" according to the levels of the factor variable.
levels(population$IncomeGroup)
## [1] "" "High income" "Low income"
## [4] "Lower middle income" "Upper middle income"
population$IncomeGroup<-factor(population$IncomeGroup,levels = c("Low income", "Lower middle income", "Upper middle income","High income"), ordered=TRUE)
#The "year" variable is chacater variable, which should be changed into the date type in the future.
head(population$year)
## [1] "X1960" "X1960" "X1960" "X1960" "X1960" "X1960"
#The year vairable is charactor due to the reason that there is a captial letter "X" at the first place of the string, which needs to be delete and extract all numeric numbers in year variable.
population$year <- str_extract(population$year , "[0-9]+")
#Then convert the character vairable "year" into date type.
population$year<-as.Date(as.character(population$year), format ="%Y")
class(population$year)
## [1] "Date"
#now, the variable year is date type in dmy format.
typeof(population$male_female_population)
## [1] "double"
#The numeric "male_female_population" variable is in a double type variable.
#Check the class of each variable in ???population??? again.
sapply(population, class)
## $Country.Name
## [1] "factor"
##
## $Country.Code
## [1] "character"
##
## $Indicator.Name
## [1] "character"
##
## $Indicator.Code
## [1] "character"
##
## $year
## [1] "Date"
##
## $male_female_population
## [1] "numeric"
##
## $Region
## [1] "factor"
##
## $IncomeGroup
## [1] "ordered" "factor"
#It can be seen that all variable has been converted into the approporiate types they should be.
#The indicator.name variable denotes if the observation is the male or femalepopulation, which should be the factor variable rather than character. Therefore, we change the character type into factor type variable and name the levels of the variable with male and female.
population$Indicator.Name<-as.factor(population$Indicator.Name)
levels(population$Indicator.Name)[levels(population$Indicator.Name)=="Population, female"] <- "female"
levels(population$Indicator.Name)[levels(population$Indicator.Name)=="Population, male"] <- "male"
class(population$Indicator.Name)
## [1] "factor"
levels(population$Indicator.Name)
## [1] "female" "male"
#Finally see the structure of "population" dataframe.
str(population)
## 'data.frame': 30096 obs. of 8 variables:
## $ Country.Name : Factor w/ 264 levels "Afghanistan",..: 11 1 6 2 5 8 250 9 10 4 ...
## $ Country.Code : chr "ABW" "AFG" "AGO" "ALB" ...
## $ Indicator.Name : Factor w/ 2 levels "female","male": 1 1 1 1 1 1 1 1 1 1 ...
## $ Indicator.Code : chr "SP.POP.TOTL.FE.IN" "SP.POP.TOTL.FE.IN" "SP.POP.TOTL.FE.IN" "SP.POP.TOTL.FE.IN" ...
## $ year : Date, format: "1960-06-24" "1960-06-24" ...
## $ male_female_population: num 27637 4346990 2900597 780595 NA ...
## $ Region : Factor w/ 8 levels "","East Asia & Pacific",..: 4 7 8 3 3 1 5 4 3 2 ...
## $ IncomeGroup : Ord.factor w/ 4 levels "Low income"<"Lower middle income"<..: 4 1 2 3 4 NA 4 3 2 3 ...
Check if the data conforms the tidy data principles.
#The target dataset is the region in North America. The tidy format issue of the dataset has ready been coped with. The dataset of population is in tidy format so does in its sub dataset, "north_america".
north_america<-population%>%filter(Region=="North America")
head(north_america)
## Country.Name Country.Code Indicator.Name Indicator.Code year
## 1 Bermuda BMU female SP.POP.TOTL.FE.IN 1960-06-24
## 2 Canada CAN female SP.POP.TOTL.FE.IN 1960-06-24
## 3 United States USA female SP.POP.TOTL.FE.IN 1960-06-24
## 4 Bermuda BMU female SP.POP.TOTL.FE.IN 1961-06-24
## 5 Canada CAN female SP.POP.TOTL.FE.IN 1961-06-24
## 6 United States USA female SP.POP.TOTL.FE.IN 1961-06-24
## male_female_population Region IncomeGroup
## 1 NA North America High income
## 2 8851741 North America High income
## 3 91167688 North America High income
## 4 NA North America High income
## 5 9040305 North America High income
## 6 92724776 North America High income
sapply(north_america, class)
## $Country.Name
## [1] "factor"
##
## $Country.Code
## [1] "character"
##
## $Indicator.Name
## [1] "factor"
##
## $Indicator.Code
## [1] "character"
##
## $year
## [1] "Date"
##
## $male_female_population
## [1] "numeric"
##
## $Region
## [1] "factor"
##
## $IncomeGroup
## [1] "ordered" "factor"
#check the levels of countries included in the subset "north_america".
levels(droplevels(north_america$Country.Name))
## [1] "Bermuda" "Canada" "United States"
#it can be seen that there are 3 coutries included in the north america, "Bermuda", "Canada" and "United States".
#Then let's scan and deal with the missing value and inconsistencies issue as well as the outlier issue of the datasets.
colSums(is.na(north_america))
## Country.Name Country.Code Indicator.Name
## 0 0 0
## Indicator.Code year male_female_population
## 0 0 114
## Region IncomeGroup
## 0 0
#It can be seen that there are no missing value in all variables in the dataset except for the male_female_population variable with 114 missing data.
#scan the missing value of the population in each of the leveled countries in North America.
ber<-north_america%>% filter(Country.Name=="Bermuda")
canada<-north_america%>% filter(Country.Name=="Canada")
us<-north_america%>% filter(Country.Name=="United States")
sum(is.na(ber$male_female_population))
## [1] 114
sum(is.na(canada$male_female_population))
## [1] 0
sum(is.na(us$male_female_population))
## [1] 0
#it can be seen that there are no missing values in both United States and Canada, however, the population information in the Bermuda region has been totally missed.
#In this case, the original dataset contains no informarion on the population of the Bermuda, therefore, there is no way to recode these missing values but delate them.
#The information on the Bermuda has nothing to do with the population information, which needs to be delated.
#Drop the bermuda information
north_america<-na.omit(north_america)
#check the levels now in the North America.
levels(droplevels(north_america$Country.Name))
## [1] "Canada" "United States"
head(north_america)
## Country.Name Country.Code Indicator.Name Indicator.Code year
## 2 Canada CAN female SP.POP.TOTL.FE.IN 1960-06-24
## 3 United States USA female SP.POP.TOTL.FE.IN 1960-06-24
## 5 Canada CAN female SP.POP.TOTL.FE.IN 1961-06-24
## 6 United States USA female SP.POP.TOTL.FE.IN 1961-06-24
## 8 Canada CAN female SP.POP.TOTL.FE.IN 1962-06-24
## 9 United States USA female SP.POP.TOTL.FE.IN 1962-06-24
## male_female_population Region IncomeGroup
## 2 8851741 North America High income
## 3 91167688 North America High income
## 5 9040305 North America High income
## 6 92724776 North America High income
## 8 9221831 North America High income
## 9 94190774 North America High income
#Now, there are only 2 countries contained in North America dataset, the "Canada"" and "United States"" with no missing value in any varibale of any observations.
#As the same as the tidy format population dataset, north_america dataset is also in tidy format. Each variable must have its own column.Each observation must have its own row.Each value must have its own cell.
Create/mutate at two variables from the existing variables to demonstrate the total population and percentatge of male and female population each year for each country.
#More information of this dataset, like total population and percentage of male and female population each year for both Canada and united states, can be demonstrated by creating and transfroming new variable based on the existing male and female population information.
#Create a new dataset called "k1" to calculate the total the population each year for both Canada and united states,
k1<-aggregate(male_female_population~year+Country.Name,north_america,sum)
#rename the variable "male_female_population" into "total.population"
colnames(k1)[3] <- "total.population"
head(k1)
## year Country.Name total.population
## 1 1960-06-24 Canada 17909009
## 2 1961-06-24 Canada 18271000
## 3 1962-06-24 Canada 18614000
## 4 1963-06-24 Canada 18964000
## 5 1964-06-24 Canada 19325000
## 6 1965-06-24 Canada 19678000
#Now the total population for of male and female population each year for both Canada and united states are get.
#The left join the dataset "k1" to the dataset "north_america".
north_america<-north_america%>%left_join(k1)
## Joining, by = c("Country.Name", "year")
#The new variable "total.population" which represent the total population for each observation has been created.
head(north_america)
## Country.Name Country.Code Indicator.Name Indicator.Code year
## 1 Canada CAN female SP.POP.TOTL.FE.IN 1960-06-24
## 2 United States USA female SP.POP.TOTL.FE.IN 1960-06-24
## 3 Canada CAN female SP.POP.TOTL.FE.IN 1961-06-24
## 4 United States USA female SP.POP.TOTL.FE.IN 1961-06-24
## 5 Canada CAN female SP.POP.TOTL.FE.IN 1962-06-24
## 6 United States USA female SP.POP.TOTL.FE.IN 1962-06-24
## male_female_population Region IncomeGroup total.population
## 1 8851741 North America High income 17909009
## 2 91167688 North America High income 180671000
## 3 9040305 North America High income 18271000
## 4 92724776 North America High income 183691000
## 5 9221831 North America High income 18614000
## 6 94190774 North America High income 186538000
#Create a new variable "population.percentage", which represent the percentage of male or female population consituting the total population of each country each year.
north_america<- north_america%>% mutate(population.percentage=male_female_population/total.population)
north_america$population.percentage <- paste(round((north_america$population.percentage)*100,digits=2),"%",sep="")
head(north_america)
## Country.Name Country.Code Indicator.Name Indicator.Code year
## 1 Canada CAN female SP.POP.TOTL.FE.IN 1960-06-24
## 2 United States USA female SP.POP.TOTL.FE.IN 1960-06-24
## 3 Canada CAN female SP.POP.TOTL.FE.IN 1961-06-24
## 4 United States USA female SP.POP.TOTL.FE.IN 1961-06-24
## 5 Canada CAN female SP.POP.TOTL.FE.IN 1962-06-24
## 6 United States USA female SP.POP.TOTL.FE.IN 1962-06-24
## male_female_population Region IncomeGroup total.population
## 1 8851741 North America High income 17909009
## 2 91167688 North America High income 180671000
## 3 9040305 North America High income 18271000
## 4 92724776 North America High income 183691000
## 5 9221831 North America High income 18614000
## 6 94190774 North America High income 186538000
## population.percentage
## 1 49.43%
## 2 50.46%
## 3 49.48%
## 4 50.48%
## 5 49.54%
## 6 50.49%
#Now, new variables,"total.population" and Population.percentage, have been created representing total population and percentage of male and female population each year for both Canada and united states.
Scan and deal with the data for missing values, inconsistencies and obvious errors.
#Next, the consistencies of the information in each valuable needs to be checked.
#list the type of each variable in the dataset.
sapply(north_america, class)
## $Country.Name
## [1] "factor"
##
## $Country.Code
## [1] "character"
##
## $Indicator.Name
## [1] "factor"
##
## $Indicator.Code
## [1] "character"
##
## $year
## [1] "Date"
##
## $male_female_population
## [1] "numeric"
##
## $Region
## [1] "factor"
##
## $IncomeGroup
## [1] "ordered" "factor"
##
## $total.population
## [1] "numeric"
##
## $population.percentage
## [1] "character"
head(north_america)
## Country.Name Country.Code Indicator.Name Indicator.Code year
## 1 Canada CAN female SP.POP.TOTL.FE.IN 1960-06-24
## 2 United States USA female SP.POP.TOTL.FE.IN 1960-06-24
## 3 Canada CAN female SP.POP.TOTL.FE.IN 1961-06-24
## 4 United States USA female SP.POP.TOTL.FE.IN 1961-06-24
## 5 Canada CAN female SP.POP.TOTL.FE.IN 1962-06-24
## 6 United States USA female SP.POP.TOTL.FE.IN 1962-06-24
## male_female_population Region IncomeGroup total.population
## 1 8851741 North America High income 17909009
## 2 91167688 North America High income 180671000
## 3 9040305 North America High income 18271000
## 4 92724776 North America High income 183691000
## 5 9221831 North America High income 18614000
## 6 94190774 North America High income 186538000
## population.percentage
## 1 49.43%
## 2 50.46%
## 3 49.48%
## 4 50.48%
## 5 49.54%
## 6 50.49%
#Convert charater variable, "Country.Name", into factor variable and check its consistency.
north_america$Country.Name<-as.factor(north_america$Country.Name)
levels(droplevels(north_america$Country.Name))
## [1] "Canada" "United States"
#Now the charater variable, "Country.Name", has been converted into factor variable with two levels "Canada" and "United States", which indicates all observations in these variable are consistent with no other other type of values.
#set rules to check if the year is restricted between 1960 and 2016.
#However, it is only desirable to keep the year part from the dmy format "year" variable. Therefere, the year extraction from date variable needs to be conducted.
north_america$year<-north_america$year %>% year()
Rule1 <- editset(c("year >= 1960", "year <= 2016"))
summary(violatedEdits(Rule1, north_america))
## No violations detected, 0 checks evaluated to NA
## NULL
#There is no violation to the rule 1.
#Check the consistency of factor variable "Country.Name".
levels(droplevels(north_america$Country.Name))
## [1] "Canada" "United States"
#the consistency of the country.name variable can be check jsut through checking the levels of the variable. All the obsetvations has assigned either Canada nor United States without any other levels or inconsistency values, like missing values.
class(north_america$Indicator.Name)
## [1] "factor"
#The indicator.name variable denotes if the observation is male or female population,which should be the factor variable.
class(north_america$Indicator.Name)
## [1] "factor"
levels(north_america$Indicator.Name)
## [1] "female" "male"
#Now the factor variable "indicator.name"" has two differen levels, male and female, which indicates no missing value and all observation in this variable has assigned a value.
#check consistency of IncomeGroup variable.
levels(droplevels(north_america$IncomeGroup))
## [1] "High income"
#All observation in north_america dataset has been classified into high income without any type of inconsistency.
colSums(is.na(north_america))
## Country.Name Country.Code Indicator.Name
## 0 0 0
## Indicator.Code year male_female_population
## 0 0 0
## Region IncomeGroup total.population
## 0 0 0
## population.percentage
## 0
#check consistency of character variables, Country.Code and Indicator.Code to see if all observations in these variables are consistency.
all(north_america$Country.Code %in% c("CAN", "USA"))
## [1] TRUE
#The result is true which means Country.Code variable is consistency and all observations in this variable are in either "CAN" or "USA" value.
all(north_america$Indicator.Code %in% c("SP.POP.TOTL.FE.IN", "SP.POP.TOTL.MA.IN"))
## [1] TRUE
#The result is true which means Indicator.Code variable is consistency and all observations in this variable are in either "SP.POP.TOTL.FE.IN" or "SP.POP.TOTL.MA.IN" value.
#After scan and deal with all the variables and observations, the north_america dataset now has no missing value and all variables are consistency.
#Check the outliers of the numeric variable "male_female_population". Due to the reason that there is only one numeric variable in original datset "male_female_population" needs to be conducting outliers analysis (the other 2 numeric variables "total.population" and "population.percentage" are all derived from "male_female_population" variable, so just univariate outlier detection), therefore, univariate outlier detection methods should be applied.
#The outliers of population size variable "male_female_population" in both Canada and United States has been checked.
canada<-north_america%>% filter(Country.Name=="Canada")
boxplot(male_female_population~Indicator.Name,data=canada,main="Box Plot of Population for Male and Female of Canada", ylab="Population", col = "grey")
us<-north_america%>% filter(Country.Name=="United States")
boxplot(male_female_population~Indicator.Name,data=us,main="Box Plot of Population for Male and Female of United States", ylab="Population", col = "grey")
#From the boxplot, there is no outliers in either United states and Canada male and female population dataset and all the observation performs normally.
#When the two variables year and population in that year are considered, it is shown that there are 3 outliers shown in Chi-Square Q-Q plot.
#However, due to the reason that the data in this analysis is the census of the population of different country,which is relatively accurate data but not baised sample size, therefore, the outliers should not be handled.
#The reason that contributes these outliers might probably by some socal factors, such as policy reform on immigration and child birth social walfare. Handling these outliers probably will bring more inaccuracy from the real data.
#Split the data into subsets that contains Canada-female, Canada-male, US-female, and US-male population information.
north_america %>%group_by(Indicator.Name, Country.Name) %>% hist(north_america$male_female_population)
## Warning in if (length(unique(w)) >= n.unique) {: the condition has length >
## 1 and only the first element will be used
## Warning in if (length(unique(w)) >= n.unique) {: the condition has length >
## 1 and only the first element will be used
## Warning in if (length(unique(w)) >= n.unique) {: the condition has length >
## 1 and only the first element will be used
b<-split(north_america, list(north_america$Country.Name, north_america$Indicator.Name),drop=TRUE)
Canada.female<-b$Canada.female
Canada.male<-b$Canada.male
US.male<-b$`United States.male`
US.female<-b$`United States.female`
#Demonstrate the histogram of these subset of the population.
hist(Canada.female$male_female_population)
hist(Canada.male$male_female_population)
hist(US.male$male_female_population)
hist(US.female$male_female_population)
#It can be seen that population in either male and female in both Canada and United States are obviously not normally distributed.
#Due to the fact that population has high correlation with the past, therefore, it is possible to make forecast of the future population, which requires nomally distributed dataset.
#Hence, a boxcox transformation variable are required for the forecasting to make population normally distributed.
#Filter the Canada female data subset and create a new variable named "transformed.population" by boxcox transformation.
Canada.female<-north_america%>%filter(Indicator.Name=="female",Country.Name=="Canada")
lambda=BoxCox.lambda(Canada.female$male_female_population)
Canada.female<-Canada.female%>%mutate(transformed.population=BoxCox(male_female_population,lambda = lambda))
#Filter the Canada female data subset and create a new variable named "transformed.population" by boxcox transformation.
Canada.male<-north_america%>%filter(Indicator.Name=="male",Country.Name=="Canada")
lambda=BoxCox.lambda(Canada.male$male_female_population)
Canada.male<-Canada.male%>%mutate(transformed.population=BoxCox(male_female_population,lambda = lambda))
#Filter the United States male data subset and create a new variable named "transformed.population" by boxcox transformation.
US.male<-north_america%>%filter(Indicator.Name=="male",Country.Name=="United States")
lambda=BoxCox.lambda(US.male$male_female_population)
US.male<-US.male%>%mutate(transformed.population=BoxCox(male_female_population,lambda = lambda))
#Filter the United States female data subset and create a new variable named "transformed.population" by boxcox transformation.
US.female<-north_america%>%filter(Indicator.Name=="female",Country.Name=="United States")
lambda=BoxCox.lambda(US.female$male_female_population)
US.female<-US.female%>%mutate(transformed.population=BoxCox(male_female_population,lambda = lambda))
#Combine these four data subsets together by merging the rows of the four dataset.
north_america<-bind_rows(Canada.male,Canada.female,US.male,US.female)
#sort and order the dataset by ascending order of the year.
north_america<-north_america[order(north_america$year,north_america$Country.Name),]
#Finally the post-processed tidy dataset "north_america" has been obtained.
head(north_america)
## Country.Name Country.Code Indicator.Name Indicator.Code year
## 1 Canada CAN male SP.POP.TOTL.MA.IN 1960
## 58 Canada CAN female SP.POP.TOTL.FE.IN 1960
## 115 United States USA male SP.POP.TOTL.MA.IN 1960
## 172 United States USA female SP.POP.TOTL.FE.IN 1960
## 2 Canada CAN male SP.POP.TOTL.MA.IN 1961
## 59 Canada CAN female SP.POP.TOTL.FE.IN 1961
## male_female_population Region IncomeGroup total.population
## 1 9057268 North America High income 17909009
## 58 8851741 North America High income 17909009
## 115 89503312 North America High income 180671000
## 172 91167688 North America High income 180671000
## 2 9230695 North America High income 18271000
## 59 9040305 North America High income 18271000
## population.percentage transformed.population
## 1 50.57% 564047.43
## 58 49.43% 6899526.15
## 115 49.54% 10246.06
## 172 50.46% 5669412.79
## 2 50.52% 572821.56
## 59 49.48% 7044034.35