suppressPackageStartupMessages({
library(tidyverse)
library(knitr)
library(Hmisc)
library(reshape2)
library(class)
})
Below, we import and explore the life expectancy data provided by the WHO (LifeExpectancyData.csv) using various methods and make some observations:
head().str(). Of the 22 features, only 2 are non-numeric.hist.data.frame() from Hmisc. There is a wide variety in distributions among the different features. Few appear to be normal.# Import CSV
df <- read.csv("https://s3.us-east-2.amazonaws.com/artificium.us/datasets/LifeExpectancyData.csv",
stringsAsFactors = FALSE)
# Inspect first 5 rows
head(df, n=5)
# Inspect the structure
str(df)
## 'data.frame': 2938 obs. of 22 variables:
## $ Country : chr "Afghanistan" "Afghanistan" "Afghanistan" "Afghanistan" ...
## $ Year : int 2015 2014 2013 2012 2011 2010 2009 2008 2007 2006 ...
## $ Status : chr "Developing" "Developing" "Developing" "Developing" ...
## $ Life.expectancy : num 65 59.9 59.9 59.5 59.2 58.8 58.6 58.1 57.5 57.3 ...
## $ Adult.Mortality : int 263 271 268 272 275 279 281 287 295 295 ...
## $ infant.deaths : int 62 64 66 69 71 74 77 80 82 84 ...
## $ Alcohol : num 0.01 0.01 0.01 0.01 0.01 0.01 0.01 0.03 0.02 0.03 ...
## $ percentage.expenditure : num 71.3 73.5 73.2 78.2 7.1 ...
## $ Hepatitis.B : int 65 62 64 67 68 66 63 64 63 64 ...
## $ Measles : int 1154 492 430 2787 3013 1989 2861 1599 1141 1990 ...
## $ BMI : num 19.1 18.6 18.1 17.6 17.2 16.7 16.2 15.7 15.2 14.7 ...
## $ under.five.deaths : int 83 86 89 93 97 102 106 110 113 116 ...
## $ Polio : int 6 58 62 67 68 66 63 64 63 58 ...
## $ Total.expenditure : num 8.16 8.18 8.13 8.52 7.87 9.2 9.42 8.33 6.73 7.43 ...
## $ Diphtheria : int 65 62 64 67 68 66 63 64 63 58 ...
## $ HIV.AIDS : num 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 ...
## $ GDP : num 584.3 612.7 631.7 670 63.5 ...
## $ Population : num 33736494 327582 31731688 3696958 2978599 ...
## $ thinness..1.19.years : num 17.2 17.5 17.7 17.9 18.2 18.4 18.6 18.8 19 19.2 ...
## $ thinness.5.9.years : num 17.3 17.5 17.7 18 18.2 18.4 18.7 18.9 19.1 19.3 ...
## $ Income.composition.of.resources: num 0.479 0.476 0.47 0.463 0.454 0.448 0.434 0.433 0.415 0.405 ...
## $ Schooling : num 10.1 10 9.9 9.8 9.5 9.2 8.9 8.7 8.4 8.1 ...
# Inspect the distribution, exclude country name
hist.data.frame(df[2:13], n.unique=2)
hist.data.frame(df[14:22], n.unique=2)
# Check for missing values
missing <- apply(df, MARGIN=2, function(x) sum(is.na(x))) %>%
as.data.frame() %>%
rename(NumMissing = 1) %>%
mutate(PctMissing = NumMissing / nrow(df) * 100) %>%
arrange(-PctMissing)
kable(missing)
| NumMissing | PctMissing | |
|---|---|---|
| Population | 652 | 22.1919673 |
| Hepatitis.B | 553 | 18.8223281 |
| GDP | 448 | 15.2484683 |
| Total.expenditure | 226 | 7.6923077 |
| Alcohol | 194 | 6.6031314 |
| Income.composition.of.resources | 167 | 5.6841389 |
| Schooling | 163 | 5.5479918 |
| BMI | 34 | 1.1572498 |
| thinness..1.19.years | 34 | 1.1572498 |
| thinness.5.9.years | 34 | 1.1572498 |
| Polio | 19 | 0.6466984 |
| Diphtheria | 19 | 0.6466984 |
| Life.expectancy | 10 | 0.3403676 |
| Adult.Mortality | 10 | 0.3403676 |
| Country | 0 | 0.0000000 |
| Year | 0 | 0.0000000 |
| Status | 0 | 0.0000000 |
| infant.deaths | 0 | 0.0000000 |
| percentage.expenditure | 0 | 0.0000000 |
| Measles | 0 | 0.0000000 |
| under.five.deaths | 0 | 0.0000000 |
| HIV.AIDS | 0 | 0.0000000 |
# Inspect pairs whose spearman correlation > 0.8
cor(df[c(-1,-2,-3)], use="pairwise.complete.obs", method="spearman") %>%
melt() %>%
filter(value > 0.8 | value < -0.8,
Var1 != Var2) %>%
group_by(value) %>%
slice_head(n=1) %>%
arrange(-abs(value)) %>%
rename(SpearmanCor = value) %>%
kable()
| Var1 | Var2 | SpearmanCor |
|---|---|---|
| under.five.deaths | infant.deaths | 0.9932012 |
| thinness.5.9.years | thinness..1.19.years | 0.9470423 |
| Diphtheria | Polio | 0.9210708 |
| Schooling | Income.composition.of.resources | 0.9005297 |
| Income.composition.of.resources | Life.expectancy | 0.8657015 |
| Diphtheria | Hepatitis.B | 0.8173095 |
| Schooling | Life.expectancy | 0.8135410 |
| GDP | percentage.expenditure | 0.8067070 |
To further investigate the missing values, we check the distribution of the features with more than 10% missing, colored by the feature we wish to build a predictive model for (Country Status).
map(rownames(missing[missing$PctMissing>10,]), ~ {
g <- df %>%
ggplot(aes(x=!!sym(.x), fill=Status)) +
geom_density(alpha=.2) +
theme_bw()
if (.x %in% c("Population", "GDP")) {
g + scale_x_log10()
} else {
g
}
})
The dependency of these features on the target variable is taken into account in the next phase when we prepare the data.
Since Hepatitis and GDP appear extremely dependent on Country status, we decide to impute the median by country status. Otherwise, the value would be skewed toward whichever status is more represented (developing countries are 4x more represented than developed countries), and this would negatively impact the predictive power of the model. We impute Population with median normally. We choose the median instead of the mean to be less skewed by outliers. Lastly, as we have a decently large dataset, we can afford to omit the remaining observations with a missing value.
df <- df %>%
# Impute population to overall median
mutate(Population = ifelse(is.na(Population), median(Population, na.rm=TRUE), Population)) %>%
# Impute the rest by median within country status
group_by(Status) %>%
mutate(Hepatitis.B = ifelse(is.na(Hepatitis.B), median(Hepatitis.B, na.rm=TRUE), Hepatitis.B),
GDP = ifelse(is.na(GDP), median(GDP, na.rm=TRUE), GDP),
Schooling = ifelse(is.na(Schooling), median(Schooling, na.rm=TRUE), Schooling)) %>%
# Exclude the rest, small portion of dataset
na.omit
map(rownames(missing[missing$PctMissing>10,]), ~ {
g <- df %>%
ggplot(aes(x=!!sym(.x), fill=Status)) +
geom_density(alpha=.2) +
theme_bw()
if (.x %in% c("Population", "GDP")) {
g + scale_x_log10()
} else {
g
}
})
The graphs above show the newly imputed dataset, understandably with a higher peak at the median value, but importantly the distinction between the two statuses are preserved. We are now left with 2556 datapoints after handling missing values.
Next, we create a derived attribute for per capital mortality and compare the rate between Developing versus Developed countries. The source file represents adult deaths per 1000 and infant deaths in absolute count, so we calculate the per-capita mortality rate as follows: Adult.Mortality/1000 + infant.deaths/Population. Error bars represent mean +/- standard deviation within each country status group.
# Calculate mortality per capita
df$Mortality.Rate = df$Adult.Mortality/1000 + df$infant.deaths/df$Population
df %>%
# Descriptive statistics by Status type
group_by(Status) %>%
dplyr::summarize(Mortality.Rate_avg = mean(Mortality.Rate),
Mortality.Rate_sd = sd(Mortality.Rate)) %>%
ggplot(aes(x=Status, y=Mortality.Rate_avg, fill=Status)) +
geom_bar(stat="identity") +
geom_errorbar(aes(
ymin = Mortality.Rate_avg - Mortality.Rate_sd,
ymax = Mortality.Rate_avg + Mortality.Rate_sd,
width=.1
)) +
theme_bw() +
scale_y_continuous(labels = scales::percent) +
labs(title = "Comparison of average mortality rate in developed vs developing countries",
y = "Average Mortality Rate")
As the calculated mean is a single value and not informative of the distribution of points, and the error bars hint at the noisiness of the data, we also opt to display the data as a boxplot, where the box represents the IQR, the center line represents the median, the whiskers represent the range, and the points represent potential outliers that are beyond 1.5*IQR from the median.
df %>%
ggplot(aes(x=Status, y=Mortality.Rate, fill=Status)) +
geom_boxplot() +
scale_y_continuous(labels = scales::percent) +
theme_bw() +
labs(title = "Comparison of per-capita mortality rates in developed vs developing countries",
y = "Mortality Per Capita")
From both plots, we can observe that both the statistical mean and the median for Per Capita Mortality is higher in Developing countries than in Developed countries. This aligns with our pre-conceived notions about healthcare access in developed versus developing countries, but the noisiness of the data, as indicated by the long whiskers and large IQR in the second plot and the error bars in the first plot, prompts for further analysis to determine whether this difference is statistically significant.
To do this, we first check for normality both visually and with a test on both datasets, shown below.
map(unique(df$Status), ~ {
df %>%
filter(Status == .x) %>%
ggplot(aes(x=Mortality.Rate)) +
geom_histogram() +
theme_bw() +
scale_x_continuous(labels = scales::percent) +
labs(title = paste("Distribution of Per Capita Mortality Rates in", .x, "Countries"),
subtitle = paste("Countries represented:", n_distinct(filter(df, Status==.x)$Country),
"| Year range:", min(filter(df, Status==.x)$Year), "-",
max(filter(df, Status==.x)$Year)),
x = "Per Capita Mortality Rate",
y = "Count")
})
## [[1]]
##
## [[2]]
We use a Shapiro Wilk normality test below on both original and log-transformed mortality rates by country status to check for normality. The sample size is appropriate for this test as it is between 3 and 5000.
t1 <- shapiro.test(df[df$Status=="Developing",]$Mortality.Rate)
t2 <- shapiro.test(df[df$Status=="Developed",]$Mortality.Rate)
t1
##
## Shapiro-Wilk normality test
##
## data: df[df$Status == "Developing", ]$Mortality.Rate
## W = 0.92642, p-value < 2.2e-16
t2
##
## Shapiro-Wilk normality test
##
## data: df[df$Status == "Developed", ]$Mortality.Rate
## W = 0.92301, p-value = 3.891e-14
t3 <- shapiro.test(log10(df[df$Status=="Developing",]$Mortality.Rate))
t4 <- shapiro.test(log10(df[df$Status=="Developed",]$Mortality.Rate))
t3
##
## Shapiro-Wilk normality test
##
## data: log10(df[df$Status == "Developing", ]$Mortality.Rate)
## W = 0.87263, p-value < 2.2e-16
t4
##
## Shapiro-Wilk normality test
##
## data: log10(df[df$Status == "Developed", ]$Mortality.Rate)
## W = 0.81979, p-value < 2.2e-16
The test returns a p value of 4.7364852^{-31} for Developing countries and 3.890908^{-14} for Developed countries. With an alpha of 0.05, we reject the null hypothesis and conclude that the distributions of both datasets are significantly different from normal. We also confirm as expected that the non-transformed data return the same conclusion (p values of 1.5775015^{-38} and 9.9551372^{-22}). Thus, a Wilcoxon Rank-Sum test would be approporate to test whether the two datasets are statistically different in their means (as opposed to a t-test, which requires that both datasets be normally distributed).
t <- wilcox.test(df[df$Status=="Developing",]$Mortality.Rate,
df[df$Status=="Developed",]$Mortality.Rate)
t
##
## Wilcoxon rank sum test with continuity correction
##
## data: df[df$Status == "Developing", ]$Mortality.Rate and df[df$Status == "Developed", ]$Mortality.Rate
## W = 701795, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
The test returns a p value of 6.2036371^{-66}. With an alpha of 0.05, we reject the null hypothesis and conclude that per-capita mortality rates are indeed distributed differently in developing versus developed countries. This suggests it may be a useful feature for our kNN model.
Next, we analyze outliers in each column (excluding the Country name and Status which are non-numerical as well as the year). We define an outlier as any value with an absolute Z-score of greater than 2.8 (i.e. greater than 2.8 standard deviations from the mean). We show all outliers below:
index_outliers <- function(x) {
x <- x[!is.na(x)]
x_mean <- mean(x)
x_sd <- sd(x)
z_scores <- abs(x - x_mean) / x_sd
# list("outliers" = x[z_scores > 2.8],
# "remaining" = x[z_scores <= 2.8],
# "indices" = z_scores > 2.8)
return(z_scores > 2.8)
}
lapply(df[-c(1,2,3)], function(x) {x[index_outliers(x)]})
## $Life.expectancy
## [1] 36.3 42.3 41.5 41.0 39.0
##
## $Adult.Mortality
## [1] 566 652 693 699 679 647 511 512 593 682 522 518 513 527 566 592 633 654 675
## [20] 666 648 622 586 543 525 559 587 615 613 599 588 513 519 533 564 587 568 536
## [39] 523 539 554 526 554 578 611 614 527 587 632 717 723 715 686 665
##
## $infant.deaths
## [1] 391 422 457 490 957 1000 1100 1100 1200 1300 1300 1400 1500 1500 1600
## [16] 1700 1700 1800 1800 490 498 505 513 521 527 536 542 549 556 563
## [31] 567 571 574 576 377 385
##
## $Alcohol
## [1] 16.35 17.31 16.99 17.87 16.58
##
## $percentage.expenditure
## [1] 10769.363 11734.854 11714.999 10986.265 8875.786 7172.275 8547.292
## [8] 8350.194 7878.372 8272.307 7423.229 8053.558 8329.732 7453.864
## [15] 7163.349 7191.052 9733.228 9748.636 8649.675 8433.937 7946.744
## [22] 10468.763 10261.763 10251.109 9765.617 10761.182 9703.068 7627.412
## [29] 6948.840 7002.786 7584.079 7641.271 8285.265 7777.556 8254.021
## [36] 6818.546 7613.815 12042.974 10631.204 8506.101 7620.823 9528.231
## [43] 9797.553 9126.931 8040.220 9498.729 7313.175 6799.664 16255.162
## [50] 15515.752 17028.528 18961.349 15345.491 12372.052 10111.389 7877.337
## [57] 8246.130 10873.406 9689.733 8344.010 10019.076 8342.406 6761.289
## [64] 15268.064 14829.412 12829.254 11792.535 7971.646 10947.023 11477.667
## [71] 8105.591 7593.392 19479.912 19099.045 18379.330 18822.867 14714.826
## [78] 11892.334 10598.082 10055.350 9495.541 6853.628
##
## $Hepatitis.B
## [1] 9 8 9 8 5 5 9 11 5 18 7 6 6 6 8 7 15 8 9 9 9 9 9 4 8
## [26] 8 7 7 14 14 14 14 14 4 17 9 9 7 6 9 8 9 7 9 8 8 8 8 9 9
## [51] 2 9 9 7 9 9 9 8 8 8 9 8 9 7 6 6 8 6 9 9 8 5 9 9 5
## [76] 5 9 8 8 9 6 17 5 8 7 7 9 9 8 8 9 8 9 9 9 9 9 9 4 8
## [101] 9 2 2 2 9 9 9 9 7 18 7 8 8 6 6 9 7 7 14 6 12 7 9 9 8
## [126] 7 9 9 8 8 9 8 9 9 9 9 9 5 8 7 4 6 7 6 5 8 14 8 9 7
##
## $Measles
## [1] 54118 52628 38159 52461 131441 109023 99602 124219 70549 71879
## [11] 58341 88962 71093 79563 33634 56188 44258 41144 64185 36711
## [21] 55443 47147 40044 51780 38835 33812 35558 62233 35256 118712
## [31] 63057 54190 61208 52852 110927 141258 42007 168107 212183 58848
## [41] 49871 48543 42554 42724
##
## $BMI
## numeric(0)
##
## $under.five.deaths
## [1] 511 558 608 1200 1300 1400 1500 1600 1700 1800 1900 2000 2000 2100 2200
## [16] 2300 2400 2500 759 773 788 802 817 832 848 863 879 893 907 918
## [31] 928 936 943
##
## $Polio
## [1] 6 5 4 4 3 9 9 8 8 9 9 9 9 8 8 9 9 9 9 9 9 9 9 9 8 8 9 4 4 3 3 9 9 9 8 7 7
## [38] 9 5 9 8 9 8 9 9 9 9 9 9 3 3 5 9 7 7 7 8 9 8 8 9 9 8 6 9 6 5 9 8 8 8 7 7 8
## [75] 9 9 9 7 8 9 6 5 8 9 9 9 8 9 7 8 6 7 7 8 8 9 9 9 9 8 9 8 8 4 6 4 7 7 8 8 8
## [112] 9 9 9 9 9 7 6 9 6 9 7 7 7 5 6 7 8 8 9 9 9 9 9 9 7 8 8 7 8 9 7
##
## $Total.expenditure
## [1] 13.66 14.39 12.60 13.73 13.71 13.38 12.77 13.76 13.83 13.44 12.94 13.13
## [13] 13.63
##
## $Diphtheria
## [1] 5 6 4 4 9 9 5 8 7 9 8 9 8 9 9 9 8 6 8 8 4 4 4 9 9 8 8 7 7 9 5 9 9 9 9 2 3
## [38] 4 9 9 5 4 3 9 7 8 8 9 9 8 8 6 6 5 8 6 9 6 6 9 7 7 9 9 9 9 8 8 7 9 5 8 8 9
## [75] 9 5 8 7 6 6 7 7 6 9 8 8 7 9 8 7 9 9 9 8 9 9 9 9 7 4 9 8 8 9 8 9 9 7 6 9 9
## [112] 9 7 8 8 8 8 8 7 5 8 9 9 9 9 9 9 9 9 9 8 5 9 9 7 7 5 7 8 7
##
## $HIV.AIDS
## [1] 20.6 28.4 31.9 34.6 37.2 38.8 16.9 18.1 18.2 27.3 30.0 34.1 34.8 34.6 33.8
## [16] 32.5 31.2 29.8 16.9 19.3 21.1 22.4 23.4 24.2 24.7 25.1 25.5 19.2 22.1 24.0
## [31] 24.7 24.6 23.9 22.8 19.0 23.5 26.4 28.1 29.5 29.7 28.9 26.6 24.0 21.3 21.6
## [46] 33.7 40.2 40.7 43.7 49.1 50.3 50.6 49.9 48.8 46.4 17.0 17.6 18.2 18.4 18.6
## [61] 18.7 18.1 20.5 23.7 26.8 30.3 33.6 36.7 39.8 42.1 43.5
##
## $GDP
## [1] 62214.69 67792.34 67677.63 62245.13 51874.85 49664.69 51322.64
## [8] 48333.57 51126.74 46657.63 47654.19 51386.38 46586.65 47439.40
## [15] 48424.59 47651.26 52413.72 52496.69 47447.48 46596.34 62425.54
## [22] 61191.19 61753.67 58163.29 64322.67 58487.45 48799.82 46511.65
## [29] 49914.62 49638.77 47415.56 48288.55 52473.11 55575.29 68348.32
## [36] 56249.76 46917.27 49231.36 52567.53 48538.59 51983.79 61235.42
## [43] 61388.17 54326.97 47631.64 48168.00 48399.96 51264.71 48268.59
## [50] 55572.00 119172.74 113751.85 115761.58 114293.84 89739.71 75716.35
## [57] 65445.89 48179.43 48736.00 52157.47 51574.49 49474.76 56928.82
## [64] 51241.32 87646.75 85128.66 74114.70 66775.39 86852.71 88564.82
## [71] 85948.75 61478.24 82967.37 51488.50 56336.72 54431.16 53166.68
## [78] 46569.68 57134.78 59593.29 55746.84 53324.38 46256.47 85814.59
## [85] 84658.89 83164.39 87998.44 74276.72 69672.47 72119.57 63223.47
## [92] 57348.93 54797.55 53255.98
##
## $Population
## [1] 198686688 196796269 194895996 186917361 184738458 182482149
## [7] 175287587 1293859294 1179681239 1161977719 1144118674 1126135777
## [13] 255131116 248883232 242524123 236159276 232989141 223614649
## [19] 185546257 181712595 177911533 174184265
##
## $thinness..1.19.years
## [1] 17.7 17.9 18.2 18.4 18.6 18.8 19.0 19.2 19.3 19.5 19.7 19.9 18.1 18.3 18.5
## [16] 18.7 18.9 19.1 19.3 19.5 19.7 19.9 17.8 18.0 18.3 18.6 18.9 19.2 26.8 26.8
## [31] 26.9 26.9 27.0 27.0 27.0 27.1 27.1 27.2 27.2 27.3 27.4 27.5 27.7 17.6 17.8
## [46] 18.0 18.2 18.3 18.5 19.4 19.6 19.8 21.0 21.2 21.4 21.6 21.8 22.0 22.2
##
## $thinness.5.9.years
## [1] 18.0 18.2 18.4 18.7 18.9 19.1 19.3 19.5 19.7 19.9 18.6 18.8 19.0 19.2 19.4
## [16] 19.7 19.9 21.1 21.3 21.5 18.1 18.3 18.6 18.8 19.1 19.4 19.6 19.9 27.4 27.5
## [31] 27.6 27.7 27.8 27.8 27.9 28.0 28.0 28.1 28.2 28.3 28.4 28.5 28.6 18.0 18.2
## [46] 18.4 18.6 18.8 19.0 19.2 19.8 21.1 21.3 21.5 21.7 21.8 22.0 22.2 22.4 22.6
## [61] 18.1 19.1
##
## $Income.composition.of.resources
## [1] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [38] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [75] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [112] 0 0 0
##
## $Schooling
## [1] 0.0 0.0 0.0 0.0 0.0 0.0 0.0 2.9 2.9 0.0 0.0 3.0 2.9 2.9 2.8 0.0
##
## $Mortality.Rate
## [1] 0.5660011 0.6520109 0.6930108 0.6990011 0.6790011 0.6470116 0.5110412
## [8] 0.5123733 0.5930206 0.6820023 0.5220019 0.5180019 0.5130138 0.5271629
## [15] 0.5661824 0.5920250 0.6330020 0.6540025 0.6750026 0.6660026 0.6480261
## [22] 0.6220260 0.5860027 0.5430027 0.5250025 0.5590267 0.5870028 0.6150316
## [29] 0.6130035 0.5990041 0.5880045 0.5130061 0.5190063 0.5330064 0.5640267
## [36] 0.5885848 0.5680173 0.5360186 0.5230435 0.5390046 0.5540476 0.5260027
## [43] 0.5540272 0.5780031 0.6110236 0.6140287 0.5270195 0.5870217 0.6320022
## [50] 0.7172163 0.7230021 0.7150021 0.6860020 0.6650020
The table below summarizes the number and percent of each column that are outliers:
outlier.summ <- lapply(df[-c(1,2,3)], function(x) { c(
NumOutliers = length(x[index_outliers(x)]),
PctOutliers = round(length(x[index_outliers(x)]) / length(x) * 100, 2)
)}) %>%
as.data.frame() %>%
t() %>%
as.data.frame() %>%
arrange(-PctOutliers)
kable(outlier.summ)
| NumOutliers | PctOutliers | |
|---|---|---|
| Hepatitis.B | 150 | 5.87 |
| Polio | 142 | 5.56 |
| Diphtheria | 140 | 5.48 |
| Income.composition.of.resources | 114 | 4.46 |
| GDP | 94 | 3.68 |
| percentage.expenditure | 82 | 3.21 |
| HIV.AIDS | 71 | 2.78 |
| thinness.5.9.years | 62 | 2.43 |
| thinness..1.19.years | 59 | 2.31 |
| Adult.Mortality | 54 | 2.11 |
| Mortality.Rate | 54 | 2.11 |
| Measles | 44 | 1.72 |
| infant.deaths | 36 | 1.41 |
| under.five.deaths | 33 | 1.29 |
| Population | 22 | 0.86 |
| Schooling | 16 | 0.63 |
| Total.expenditure | 13 | 0.51 |
| Life.expectancy | 5 | 0.20 |
| Alcohol | 5 | 0.20 |
| BMI | 0 | 0.00 |
Only 3 features have more than 5% outliers: Hepatitis.B, Polio, and Diphtheria. All outlier values originate from the small peak on the far left of the distribution, shown below. Considering that both developed and developing countries have this small peak and that the key distinguishing feature is the far right of the curve, it is reasonable to remove these observations. We choose to leave the remaining outliers as they make up such a small proportion of the data and are unlikely to sway the general trend. Another option is imputation, but we believe it is ill-advised to fabricate values when values already exist, and our earlier imputation of missing values has already brought the data much closer to the median.
map(rownames(outlier.summ[outlier.summ$PctOutliers>5,]), ~ {
df %>%
ggplot(aes(x=!!sym(.x), color=Status)) +
geom_density() +
theme_bw()
})
## [[1]]
##
## [[2]]
##
## [[3]]
mask <- lapply(df[c("Hepatitis.B", "Polio", "Diphtheria")], function(x) {index_outliers(x)}) # list of outlier logicals per feature
mask <- bind_rows(mask) # convert to dataframe
mask <- apply(mask, MARGIN=1, FUN=any) # check every row for an outlier in any feature
df <- df[!mask,]
To conclude, we have decided to eliminate observations with an outlier value for Hepatitis.B, Polio, or Diphtheria because these outliers constitute slightly greater than 5% of the column and do not appear to be very distinct in their distribution between developed vs developing countries. We have left the remaining outliers as they are small enough in number to have little impact on the overall trend. This leaves us with 2242 observations.
In this next phase, we prepare the data first by normalizing all numeric columns (i.e. everything except Country name and Status) using Z-score standardization. This is an important step because we do not want values that are intrinsically higher in magnitude, such as a year (thousands) or population (in hundreds of thousands) to shadow values that are intrinsically lower in magnitude, such as a rate (between 0 and 1) or counts of cases. We accomplish this by using lapply() to normalize every column: subtract the column’s mean then divide by the column’s standard deviation.
# Create function
normalize <- function(x) {
(x-mean(x, na.rm=TRUE))/sd(x, na.rm=TRUE)
}
# Apply to dataframe
# NOTE: Exclude the id column and the diagnosis column
df.norm <- cbind(Country = df$Country,
Status = df$Status,
as.data.frame(lapply(df[,c(-1, -3)], normalize)))
Next, we add a new, derived feature to the dataframe called disease that is the sum of the columns Hepatitis.B, Measles, Polio, HIV.AIDS, and Diphtheria. Note that we first calculate this from the un-normalized data then normalize the new column before adding it to the dataset. Recall that some of these diseases were correlated in our earlier finding. Also recall that roughly 20% of Hepatitis.B values were missing and addressed via mean-imputation.
df$Disease <- df$Hepatitis.B + df$Measles + df$Polio + df$HIV.AIDS + df$Diphtheria
df.norm$Disease <- normalize(df$Disease)
head(df.norm[c("Hepatitis.B", "Measles", "Polio", "HIV.AIDS", "Diphtheria", "Disease")], n=5) %>% kable()
| Hepatitis.B | Measles | Polio | HIV.AIDS | Diphtheria | Disease |
|---|---|---|---|---|---|
| -1.961569 | -0.1521442 | -2.114268 | -0.3113298 | -1.763941 | -0.1599907 |
| -1.812107 | -0.1579018 | -1.830861 | -0.3113298 | -1.626642 | -0.1650085 |
| -1.587914 | 0.0609785 | -1.476603 | -0.3113298 | -1.420694 | 0.0550350 |
| -1.513184 | 0.0819658 | -1.405751 | -0.3113298 | -1.352044 | 0.0763146 |
| -1.662645 | -0.0131269 | -1.547454 | -0.3113298 | -1.489343 | -0.0193969 |
Next, we split the dataset into Developing and Developed countries, randomly sample 15% of each and designate them as the validation set, and designate the remaining as the testing set. We confirm the distribution with a table.
split_train_val <- function(x, train_pct) {
# Generate random indices
rand_ind <- sample(seq(1,nrow(x)),
size = floor(train_pct * nrow(x)),
replace = FALSE)
# Split
df.train <- x[rand_ind,]
df.val <- x[-rand_ind,]
list(train = df.train, val = df.val)
}
# Segregate Developing and Developed
df_developing <- filter(df.norm, Status=="Developing")
df_developed <- filter(df.norm, Status=="Developed")
# Set seed to make reproducible
set.seed(-1)
# Split both
developing <- split_train_val(df_developing, train_pct=0.85)
developed <- split_train_val(df_developed, train_pct=0.85)
# Merge into one train and one test set
df.train <- rbind(developing$train, developed$train)
df.val <- rbind(developing$val, developed$val)
# Confirm
t <- rbind(table(df.norm$Status), table(df.train$Status), table(df.val$Status))
rownames(t) = c("Original", "TrainingSet", "ValidationSet")
kable(t)
| Developed | Developing | |
|---|---|---|
| Original | 412 | 1830 |
| TrainingSet | 350 | 1555 |
| ValidationSet | 62 | 275 |
Next, we apply the knn() function from class with k=6 to predict the country status for the following new data point.
Life expectancy = 67.4 | Adult Mortality = 293 | infant deaths = 4 | Alcohol = 2.68 | percentage expenditure = 40.7 | Hepatitis B = 40 |Measles = 671 | BMI = 14.2 | GDP = 687 | under-five deaths = 211 | Polio = 20 | Diphtheria = 97
The following steps were taken:
# Create a new point with some defined values and the rest median imputed
new_point <- lapply(df, FUN=median) %>%
as.data.frame()
new_point$Life.expectancy = 67.4
new_point$Adult.Mortality = 293
new_point$infant.deaths = 4
new_point$Alcohol = 2.68
new_point$percentage.expenditure = 40.7
new_point$Hepatitis.B = 40
new_point$Measles = 671
new_point$BMI = 14.2
new_point$GDP = 687
new_point$under.five.deaths = 211
new_point$Polio = 20
new_point$Diphtheria = 97
# Normalize the same way
means <- lapply(df, FUN=mean)
sds <- lapply(df, FUN=sd)
new_point <- rbind(new_point, means, sds) %>%
t() %>%
as.data.frame() %>%
rename(value = V1, means = V2, sds = V3) %>%
mutate(normalized = (value - means) / sds) %>%
select(normalized) %>%
t() %>%
as.data.frame()
model <- knn(train = df.train[-c(1,2)], # drop status and country
test = rbind(df.val, new_point)[-c(1,2)], # drop status and country
cl = factor(df.train$Status), # convert to factor
k = 6)
The model predicts this new datapoint belongs to the country status: Developing
Below, we test values of k from 3 to 10 and plot the resulting accuracy of the model (percentage of correction classifications with the validation set).
set.seed(-1)
accuracies <- map_df(seq(3,10), function(x) {
model <- knn(train = df.train[-c(1,2)], # drop status and country
test = df.val[-c(1,2)], # drop status and country
cl = factor(df.train$Status), # convert to factor
k = x)
list(k = x,
accuracy = sum(model == factor(df.val$Status))/nrow(df.val))
})
accuracies %>%
ggplot(aes(x=k, y=accuracy)) +
geom_point() +
geom_line() +
theme_bw() +
scale_y_continuous(labels=scales::percent) +
labs(title = "Analysis of kNN accuracy with a range of k values")
Out of curiosity, since the recommended value of k is typically the square root of the number of datapoints in the training set, we extend this exercise further:
set.seed(-1)
accuracies <- map_df(seq(3,sqrt(nrow(df.train))), function(x) {
model <- knn(train = df.train[-c(1,2)], # drop status and country
test = df.val[-c(1,2)], # drop status and country
cl = factor(df.train$Status), # convert to factor
k = x)
list(k = x,
accuracy = sum(model == factor(df.val$Status))/nrow(df.val))
})
accuracies %>%
ggplot(aes(x=k, y=accuracy)) +
geom_point() +
geom_line() +
theme_bw() +
scale_y_continuous(labels=scales::percent) +
labs(title = "Analysis of kNN accuracy with a range of k values")
We can see from both plots that a k value of 7 produces the most accurate prediction, and accuracy trends downward when increasing k from there. This is similar to the k value we selected for our prediction in the previous section. Considering that our sample sizes are 1905 and 337 for the training and validation sets respectively, this relatively small optimal k value indicates that features distinguishing developed versus developing countries are quite distinct. Overall, our model appears to be quite accurate.