Note

-In base R, character variables are often presented as factor

Resources

-Download the following csv files from the gapminder web site (https://www.gapminder.org).

  1. “cholesterol_fat_in_blood_women_mmolperl.csv”
  2. “body_mass_index_infant.mortality_women_kgperm2.csv”
  3. “blood_pressure_sbp_women_mmhg.csv”
  4. “children_per_woman_total_fertility.csv”
  5. “contraceptive_use_percent_of_women_ages_15_49.csv”
  6. “breast_cancer_number_of_female_deaths.csv”
  7. “infant_mortality_rate_per_1000_births.csv”
  8. “maternal_deaths_total_number.csv”
  9. “maternal_mortality_ratio_per_100000_live_births.csv”

Import the csv files to create following datasets

cholesterol_fat_in_blood_women_mmolperl -> fm.tc -> fm.tc.long

fm.tc <- read.csv("gpmlib/cholesterol_fat_in_blood_women_mmolperl.csv")

fm.tc[c(1, 100, 189), c(1, 10,11, 20, 21, 30 )]
country X1988 X1989 X1998 X1999 X2008
1 Afghanistan 4.58 4.57 4.37 4.36 4.24
100 Madagascar 4.52 4.50 4.40 4.39 4.30
189 Zimbabwe 4.59 4.57 4.43 4.43 4.33
dim(fm.tc)
## [1] 189  30
fm.tc.long <-reshape(fm.tc, idvar = "country", varying=2:30, sep = "", direction = "long") 

rownames(fm.tc.long) <- c()

names(fm.tc.long) <- c("country", "year", "tc")

head(fm.tc.long)
country year tc
Afghanistan 1980 4.64
Albania 1980 5.04
Algeria 1980 4.98
Andorra 1980 6.13
Angola 1980 4.79
Antigua and Barbuda 1980 5.01
dim(fm.tc.long)
## [1] 5481    3

body_mass_index_bmi_women_kgperm2 -> fm.bmi -> fm.bmi.long

fm.bmi <-read.csv("gpmlib/body_mass_index_bmi_women_kgperm2.csv")

fm.bmi[c(1, 100, 189), c(1, 10,11, 20,21, 30)]
country X1988 X1989 X1998 X1999 X2008
1 Afghanistan 20.7 20.7 20.6 20.6 21.1
100 Madagascar 20.4 20.5 20.6 20.6 20.7
189 Zimbabwe 25.0 24.9 24.7 24.7 24.6
dim(fm.bmi)
## [1] 189  30
fm.bmi.long <-reshape(fm.bmi, idvar = "country", varying=2:30, sep = "", direction = "long") 

rownames(fm.bmi.long) <- c()

names(fm.bmi.long) <- c("country", "year", "bmi")

head(fm.bmi.long)
country year bmi
Afghanistan 1980 20.4
Albania 1980 25.2
Algeria 1980 23.7
Andorra 1980 25.7
Angola 1980 20.1
Antigua and Barbuda 1980 24.2
dim(fm.bmi.long)
## [1] 5481    3

blood_pressure_sbp_women_mmhg -> fm.sbp -> fm.sbp.long

fm.sbp <- read.csv("gpmlib/blood_pressure_sbp_women_mmhg.csv")

fm.sbp[c(1, 100, 189), c(1, 10,11, 20,21, 30)]
country X1988 X1989 X1998 X1999 X2008
1 Afghanistan 124 124 124 124 125
100 Madagascar 128 128 130 130 133
189 Zimbabwe 130 130 131 131 132
dim(fm.sbp)
## [1] 189  30
fm.sbp.long <-reshape(fm.sbp, idvar = "country", varying=2:30, sep = "", direction = "long") 

rownames(fm.sbp.long) <- c()

names(fm.sbp.long) <- c("country", "year", "sbp")

head(fm.sbp.long)
country year sbp
Afghanistan 1980 122
Albania 1980 132
Algeria 1980 131
Andorra 1980 137
Angola 1980 130
Antigua and Barbuda 1980 126
dim(fm.sbp.long)
## [1] 5481    3

children_per_woman_total_fertility -> fm.fertility -> fm.fertility.long

fm.fertility <- read.csv("gpmlib/children_per_woman_total_fertility.csv")

fm.fertility[c(1, 100, 184), c(1, 100,101, 200,201, 220)]
country X1898 X1899 X1998 X1999 X2018
1 Afghanistan 7.00 7.00 7.62 7.57 4.33
100 Malaysia 5.45 5.45 3.04 2.91 2.00
184 Zimbabwe 6.75 6.75 4.16 4.10 3.61
dim(fm.fertility)
## [1] 184 220
fm.fertility.long <-reshape(fm.fertility, idvar = "country", varying=2:220, sep = "", direction = "long") 

rownames(fm.fertility.long) <- c()

names(fm.fertility.long) <- c("country", "year", "total_fertility")

head(fm.fertility.long)
country year total_fertility
Afghanistan 1800 7.00
Albania 1800 4.60
Algeria 1800 6.99
Angola 1800 6.93
Antigua and Barbuda 1800 5.00
Argentina 1800 6.80
dim(fm.fertility.long)
## [1] 40296     3

contraceptive_use_percent_of_women_ages_15_49.csv -> fm.contracept.use -> fm.contracept.use.long

fm.contracept.use <- read.csv("gpmlib/contraceptive_use_percent_of_women_ages_15_49.csv")

fm.contracept.use[c(1, 100, 185), c(1, 52:58)]
country X2011 X2012 X2013 X2014 X2015 X2016 X2017
1 Afghanistan 21.2 NA NA NA 22.5 NA NA
100 Malta NA NA NA NA NA NA NA
185 Zimbabwe 58.5 NA NA 66.9 66.8 NA NA
dim(fm.contracept.use)
## [1] 185  58
fm.contracept.use.long <-reshape(fm.contracept.use, idvar = "country", varying=2:58, sep = "", direction = "long") 

rownames(fm.contracept.use.long) <- c()

names(fm.contracept.use.long) <- c("country", "year", "contraceptive_use_percent")

tail(fm.contracept.use.long)
country year contraceptive_use_percent
10540 Vanuatu 2017 NA
10541 Venezuela 2017 NA
10542 Vietnam 2017 NA
10543 Yemen 2017 NA
10544 Zambia 2017 NA
10545 Zimbabwe 2017 NA
dim(fm.contracept.use.long)
## [1] 10545     3

breast_cancer_number_of_female_deaths -> fm.brstc.death -> fm.brstc.death.long

fm.brstc.death <- read.csv("gpmlib/breast_cancer_number_of_female_deaths.csv")

fm.brstc.death[c(1, 100, 187), c(1, 24:28)]
country X2012 X2013 X2014 X2015 X2016
1 Afghanistan 2340 2420 2510 2600 2690
100 Madagascar 789 823 856 892 922
187 Zimbabwe 764 770 819 841 870
dim(fm.brstc.death)
## [1] 187  28
fm.brstc.death.long <-reshape(fm.brstc.death, idvar = "country", varying=2:28, sep = "", direction = "long") 

rownames(fm.brstc.death.long) <- c()

names(fm.brstc.death.long) <- c("country", "year", "breast_cancer_deaths")

head(fm.brstc.death.long)
country year breast_cancer_deaths
Afghanistan 1990 761.00
Albania 1990 100.00
Algeria 1990 701.00
Andorra 1990 9.80
Angola 1990 420.00
Antigua and Barbuda 1990 5.71
dim(fm.brstc.death.long)
## [1] 5049    3

infant_mortality_rate_per_1000_births.csv -> infant.mortality -> infant.mortality.long

infant.mortality <- read.csv("gpmlib/infant_mortality_rate_per_1000_births.csv")

infant.mortality[c(1, 100, 194), c(1, 100, 101, 200, 201, 217)]
country X1898 X1899 X1998 X1999 X2015
1 Afghanistan NA NA 98.9 97.2 66.3
100 Macedonia, FYR NA NA 16.1 15.0 4.8
194 Zimbabwe NA NA 63.3 63.5 46.6
dim(infant.mortality)
## [1] 194 217
infant.mortality.long <-reshape(infant.mortality, idvar = "country", varying=2:217, sep = "", direction = "long") 

names(infant.mortality.long) <- c("country", "year", "infant_mortality_rate")

rownames(infant.mortality.long) <- NULL

tail(infant.mortality.long)
country year infant_mortality_rate
41899 Vanuatu 2015 23.1
41900 Venezuela 2015 12.9
41901 Vietnam 2015 17.3
41902 Yemen 2015 33.8
41903 Zambia 2015 43.3
41904 Zimbabwe 2015 46.6
dim(infant.mortality.long)
## [1] 41904     3

maternal_deaths_total_number.csv -> maternal.death -> maternal.death.long

maternal.death <- read.csv("gpmlib/maternal_deaths_total_number.csv")

maternal.death[c(1, 100, 183), c(1, 100,101, 200,201, 215)]
country X1898 X1899 X1998 X1999 X2013
1 Afghanistan NA NA NA NA 4200
100 Maldives NA NA NA NA 2
183 Zimbabwe NA NA NA NA 2100
dim(maternal.death)
## [1] 183 215
maternal.death.long <-reshape(maternal.death, idvar = "country", varying=2:215, sep = "", direction = "long") 

rownames(maternal.death.long) <- c()

names(maternal.death.long) <- c("country", "year", "maternal_deaths")

tail(maternal.death.long)
country year maternal_deaths
39157 Vanuatu 2013 6
39158 Venezuela 2013 650
39159 Vietnam 2013 690
39160 Yemen 2013 2100
39161 Zambia 2013 1800
39162 Zimbabwe 2013 2100
dim(maternal.death.long)
## [1] 39162     3

maternal_mortality_ratio_per_100000_live_births.csv -> maternal.mortality.ratio -> maternal.mortality.ratio.long

maternal.mortality.ratio <- read.csv("gpmlib/maternal_mortality_ratio_per_100000_live_births.csv")

maternal.mortality.ratio[c(1, 100, 187), c(1, 100, 101, 200, 201, 215)]
country X1898 X1899 X1998 X1999 X2013
1 Afghanistan NA NA NA NA 400
100 Madagascar NA NA NA NA 440
187 Zimbabwe NA NA NA NA 470
dim(maternal.mortality.ratio)
## [1] 187 215
maternal.mortality.ratio.long <-reshape(maternal.mortality.ratio, idvar = "country", varying=2:215, sep = "", direction = "long") 
rownames(maternal.mortality.ratio.long) <- c()

names(maternal.mortality.ratio.long) <- c("country", "year", "maternal_mortality_ratio")

tail(maternal.mortality.ratio.long)
country year maternal_mortality_ratio
40013 Vanuatu 2013 86
40014 Venezuela 2013 110
40015 Vietnam 2013 49
40016 Yemen 2013 270
40017 Zambia 2013 280
40018 Zimbabwe 2013 470
dim(maternal.mortality.ratio.long)
## [1] 40018     3

Inner join

Output: Total rows: 3382 and Total columns: 11

gpm.i <- fm.tc.long %>% 
        merge(fm.bmi.long) %>%
        merge(fm.sbp.long) %>%
        merge(fm.tc.long) %>%
        merge(fm.fertility.long)%>%
        merge(fm.contracept.use.long)%>% 
        merge(fm.brstc.death.long)%>%
        merge(infant.mortality.long)%>%
        merge(maternal.death.long) %>%
        merge(maternal.mortality.ratio.long)

head(gpm.i, n=3)
country year tc bmi sbp total_fertility contraceptive_use_percent breast_cancer_deaths infant_mortality_rate maternal_deaths maternal_mortality_ratio
Afghanistan 1990 4.55 20.7 124 7.47 NA 761 123 7520 1200
Afghanistan 1991 4.53 20.7 124 7.48 NA 817 118 NA NA
Afghanistan 1992 4.51 20.7 124 7.50 NA 894 114 NA NA
dim(gpm.i)
## [1] 3382   11

complete cases or omit NA

Output: Total rows: 177 and Total columns: 11

cs.i <- complete.cases(gpm.i)

head(gpm.i[cs.i,], n=3)
country year tc bmi sbp total_fertility contraceptive_use_percent breast_cancer_deaths infant_mortality_rate maternal_deaths maternal_mortality_ratio
11 Afghanistan 2000 4.33 20.6 124 7.49 5.3 1470 95.4 10700 1100
16 Afghanistan 2005 4.26 20.8 125 6.83 13.6 1880 84.4 7900 730
30 Albania 2000 4.92 25.1 129 2.16 57.5 150 23.2 14 28
dim(gpm.i[cs.i,])
## [1] 177  11
head(na.omit(gpm.i), n=3)
country year tc bmi sbp total_fertility contraceptive_use_percent breast_cancer_deaths infant_mortality_rate maternal_deaths maternal_mortality_ratio
11 Afghanistan 2000 4.33 20.6 124 7.49 5.3 1470 95.4 10700 1100
16 Afghanistan 2005 4.26 20.8 125 6.83 13.6 1880 84.4 7900 730
30 Albania 2000 4.92 25.1 129 2.16 57.5 150 23.2 14 28
dim(na.omit(gpm.i))
## [1] 177  11

exclude 3 variables with many missing values from the join

Output: Total rows: 3477 and Total columns: 8

gpm.i.1 <- fm.tc.long %>% 
        merge(fm.bmi.long) %>%
        merge(fm.sbp.long) %>%
        merge(fm.fertility.long) %>%
        merge(fm.brstc.death.long) %>%
        merge(infant.mortality.long)

head(gpm.i.1, n=3)
country year tc bmi sbp total_fertility breast_cancer_deaths infant_mortality_rate
Afghanistan 1990 4.55 20.7 124 7.47 761 123
Afghanistan 1991 4.53 20.7 124 7.48 817 118
Afghanistan 1992 4.51 20.7 124 7.50 894 114
dim(gpm.i.1)
## [1] 3477    8

complete cases or omit NA

Output: Total rows: 3477 and Total columns: 8

cs.i.1 <- complete.cases(gpm.i.1)

head(gpm.i.1[cs.i.1,], n=3)
country year tc bmi sbp total_fertility breast_cancer_deaths infant_mortality_rate
Afghanistan 1990 4.55 20.7 124 7.47 761 123
Afghanistan 1991 4.53 20.7 124 7.48 817 118
Afghanistan 1992 4.51 20.7 124 7.50 894 114
dim(gpm.i.1[cs.i.1,])
## [1] 3477    8
head(na.omit(gpm.i.1), n=3)
country year tc bmi sbp total_fertility breast_cancer_deaths infant_mortality_rate
Afghanistan 1990 4.55 20.7 124 7.47 761 123
Afghanistan 1991 4.53 20.7 124 7.48 817 118
Afghanistan 1992 4.51 20.7 124 7.50 894 114
dim(na.omit(gpm.i))
## [1] 177  11

Left join

Output: Total rows: 5481 and Total columns: 11

gpm.l <-fm.tc.long %>% merge(fm.bmi.long, all.x = TRUE) %>%
        merge(fm.sbp.long, all.x = TRUE) %>%
        merge(fm.tc.long, all.x = TRUE) %>%
        merge(fm.fertility.long, all.x = TRUE)%>%
        merge(fm.contracept.use.long, all.x = TRUE)%>% 
        merge(fm.brstc.death.long, all.x = TRUE)%>%
        merge(infant.mortality.long, all.x = TRUE)%>%
        merge(maternal.death.long, all.x = TRUE) %>%
        merge(maternal.mortality.ratio.long, all.x = TRUE)

head(gpm.l, n=3)
country year tc bmi sbp total_fertility contraceptive_use_percent breast_cancer_deaths infant_mortality_rate maternal_deaths maternal_mortality_ratio
Afghanistan 1980 4.64 20.4 122 7.45 NA NA 166 10600 1640
Afghanistan 1981 4.64 20.5 122 7.45 NA NA 162 NA NA
Afghanistan 1982 4.63 20.5 123 7.45 NA NA 158 NA NA
dim(gpm.l)
## [1] 5481   11

exclude 3 variables with many missing values from the join

Output: Total rows: 5481 and Total columns: 8

gpm.l.1 <-fm.tc.long %>% merge(fm.bmi.long, all.x = TRUE) %>%
        merge(fm.sbp.long, all.x = TRUE) %>%
        merge(fm.tc.long, all.x = TRUE) %>%
        merge(fm.fertility.long, all.x = TRUE)%>%
        merge(fm.brstc.death.long, all.x = TRUE)%>%
        merge(infant.mortality.long, all.x = TRUE)

head(gpm.l.1, n=3)
country year tc bmi sbp total_fertility breast_cancer_deaths infant_mortality_rate
Afghanistan 1980 4.64 20.4 122 7.45 NA 166
Afghanistan 1981 4.64 20.5 122 7.45 NA 162
Afghanistan 1982 4.63 20.5 123 7.45 NA 158
dim(gpm.i.1)
## [1] 3477    8

complete cases or omit NA

Output: Total rows: 3477 and Total columns: 8

cs.l.1 <- complete.cases(gpm.l.1)

head(gpm.l.1[cs.l.1,], n=3)
country year tc bmi sbp total_fertility breast_cancer_deaths infant_mortality_rate
11 Afghanistan 1990 4.55 20.7 124 7.47 761 123
12 Afghanistan 1991 4.53 20.7 124 7.48 817 118
13 Afghanistan 1992 4.51 20.7 124 7.50 894 114
dim(gpm.l.1[cs.l.1, ])
## [1] 3477    8
head(na.omit(gpm.l.1), n=3)
country year tc bmi sbp total_fertility breast_cancer_deaths infant_mortality_rate
11 Afghanistan 1990 4.55 20.7 124 7.47 761 123
12 Afghanistan 1991 4.53 20.7 124 7.48 817 118
13 Afghanistan 1992 4.51 20.7 124 7.50 894 114
dim(na.omit(gpm.i))
## [1] 177  11

Right join

Output: Total rows: 41904 and Total columns: 11

gpm.r <- merge(fm.tc.long, fm.bmi.long, all.y = TRUE) %>%
        merge(fm.sbp.long, all.y = TRUE) %>%
        merge(fm.tc.long, all.y = TRUE) %>%
        merge(fm.fertility.long, all.y = TRUE)%>%
        merge(fm.contracept.use.long, all.y = TRUE)%>% 
        merge(fm.brstc.death.long, all.y = TRUE)%>%
        merge(infant.mortality.long, all.y = TRUE)%>%
        merge(maternal.death.long, all.y = TRUE) %>%
        merge(maternal.mortality.ratio.long, all.y = TRUE)

head(gpm.r, n=3)
country year tc bmi sbp total_fertility contraceptive_use_percent breast_cancer_deaths infant_mortality_rate maternal_deaths maternal_mortality_ratio
Afghanistan 1800 NA NA NA NA NA NA NA NA NA
Afghanistan 1801 NA NA NA NA NA NA NA NA NA
Afghanistan 1802 NA NA NA NA NA NA NA NA NA
dim(gpm.r)
## [1] 40018    11

exclude 3 variables with many missing values from the join

Output: Total rows: 41904 and Total columns: 8

gpm.r.1 <- merge(fm.tc.long, fm.bmi.long, all.y = TRUE) %>%
        merge(fm.sbp.long, all.y = TRUE) %>%
        merge(fm.tc.long, all.y = TRUE) %>%
        merge(fm.fertility.long, all.y = TRUE)%>%
        merge(fm.brstc.death.long, all.y = TRUE)%>%
        merge(infant.mortality.long, all.y = TRUE)

head(gpm.r.1, n=3)
country year tc bmi sbp total_fertility breast_cancer_deaths infant_mortality_rate
Afghanistan 1800 NA NA NA NA NA NA
Afghanistan 1801 NA NA NA NA NA NA
Afghanistan 1802 NA NA NA NA NA NA
dim(gpm.r.1)
## [1] 41904     8

complete cases or omit NA

Output: Total rows: 3477 and Total columns: 8

cs.r.1 <- complete.cases(gpm.r.1)

head(gpm.r.1[cs.r.1,], n=3)
country year tc bmi sbp total_fertility breast_cancer_deaths infant_mortality_rate
191 Afghanistan 1990 4.55 20.7 124 7.47 761 123
192 Afghanistan 1991 4.53 20.7 124 7.48 817 118
193 Afghanistan 1992 4.51 20.7 124 7.50 894 114
dim(gpm.r.1[cs.r.1,])
## [1] 3477    8
head(na.omit(gpm.r.1), n=3)
country year tc bmi sbp total_fertility breast_cancer_deaths infant_mortality_rate
191 Afghanistan 1990 4.55 20.7 124 7.47 761 123
192 Afghanistan 1991 4.53 20.7 124 7.48 817 118
193 Afghanistan 1992 4.51 20.7 124 7.50 894 114
dim(na.omit(gpm.r.1))
## [1] 3477    8

Full join

Output: Total rows: 42469 and Total columns: 11

gpm.f <- merge(fm.tc.long, fm.bmi.long, all.x = TRUE, all.y = TRUE) %>%
        merge(fm.sbp.long, all.x = TRUE, all.y = TRUE) %>%
        merge(fm.tc.long, all.x = TRUE, all.y = TRUE) %>%
        merge(fm.fertility.long, all.x = TRUE, all.y = TRUE)%>%
        merge(fm.contracept.use.long, all.x = TRUE, all.y = TRUE)%>% 
        merge(fm.brstc.death.long, all.x = TRUE, all.y = TRUE)%>%
        merge(infant.mortality.long, all.x = TRUE, all.y = TRUE)%>%
        merge(maternal.death.long, all.x = TRUE, all.y = TRUE) %>%
        merge(maternal.mortality.ratio.long, all.x = TRUE, all.y = TRUE)

head(gpm.f, n=3)
country year tc bmi sbp total_fertility contraceptive_use_percent breast_cancer_deaths infant_mortality_rate maternal_deaths maternal_mortality_ratio
Afghanistan 1800 NA NA NA 7 NA NA NA NA NA
Afghanistan 1801 NA NA NA 7 NA NA NA NA NA
Afghanistan 1802 NA NA NA 7 NA NA NA NA NA
dim(gpm.f)
## [1] 42469    11

exclude 3 variables with many missing values from the join

Output: Total rows: 42459 and Total columns: 8

gpm.f.1 <- merge(fm.tc.long, fm.bmi.long, all.x = TRUE, all.y = TRUE) %>%
        merge(fm.sbp.long, all.x = TRUE, all.y = TRUE) %>%
        merge(fm.tc.long, all.x = TRUE, all.y = TRUE) %>%
        merge(fm.fertility.long, all.x = TRUE, all.y = TRUE)%>%
        merge(fm.brstc.death.long, all.x = TRUE, all.y = TRUE)%>%
        merge(infant.mortality.long, all.x = TRUE, all.y = TRUE)

head(gpm.f.1, n=3)
country year tc bmi sbp total_fertility breast_cancer_deaths infant_mortality_rate
Afghanistan 1800 NA NA NA 7 NA NA
Afghanistan 1801 NA NA NA 7 NA NA
Afghanistan 1802 NA NA NA 7 NA NA
dim(gpm.f.1)
## [1] 42459     8

complete cases or omit NA

Output: Total rows: 3477 and Total columns: 8

cs.f.1 <- complete.cases(gpm.f.1)

head(gpm.f.1[cs.f.1,], n=3)
country year tc bmi sbp total_fertility breast_cancer_deaths infant_mortality_rate
191 Afghanistan 1990 4.55 20.7 124 7.47 761 123
192 Afghanistan 1991 4.53 20.7 124 7.48 817 118
193 Afghanistan 1992 4.51 20.7 124 7.50 894 114
dim(gpm.f.1[cs.f.1,])
## [1] 3477    8
head(na.omit(gpm.f.1), n=3)
country year tc bmi sbp total_fertility breast_cancer_deaths infant_mortality_rate
191 Afghanistan 1990 4.55 20.7 124 7.47 761 123
192 Afghanistan 1991 4.53 20.7 124 7.48 817 118
193 Afghanistan 1992 4.51 20.7 124 7.50 894 114
dim(na.omit(gpm.f.1))
## [1] 3477    8