# This is the R chunk for the required packages
library(readr)
library(tidyr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(deductive)
library(validate)
##
## Attaching package: 'validate'
## The following object is masked from 'package:dplyr':
##
## expr
library(forecast)
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
library(Hmisc)
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
## Loading required package: ggplot2
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:validate':
##
## expr
##
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:validate':
##
## label, label<-
## The following objects are masked from 'package:dplyr':
##
## src, summarize
## The following objects are masked from 'package:base':
##
## format.pval, units
library(MVN)
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
## sROC 0.1-2 loaded
library(ggplot2)
The dataset used here is the result of merging three different datasets. Before the merge, each dataset is filtered and reduced to the relevant columns to get rid of unnecessary variables.
After the summary statistics have been checked, the data will be scanned for missing values, which will then be imputed.
The next steps involve detecting and removing outliers, using the MVN package for multivariate outliers. Finally, the variable Total USD/capita will be transformed to bring its distribution closer to normal.
The three merged datasets are Adult Education Level, Poverty Rate and Gross National Income, obtained from the OECD Data open-data website. After merging, the following variables were used: LOCATION - country index; TIME - year in which the data were obtained; BUPPSRY - below upper secondary level of education; UPPSRY - upper secondary level of education; TRY - tertiary level of education; Poverty rate - the total ratio of the number of people whose income is below the poverty line; Total USD/capita - gross national income at current prices in US dollars per capita.
Before merging, each individual dataset needs to be tidied. This is covered in the next step, Tidy & Manipulate I.
# Read the three OECD CSV extracts with read_csv() from the readr package.
# The data directory is resolved once and joined with file.path() instead of
# calling setwd(), so the session's working directory is left untouched.
# Set the OECD_DATA_DIR environment variable to read from another folder;
# when it is unset, the original download location is used.
data_dir <- Sys.getenv("OECD_DATA_DIR", unset = "C:/Users/smj_l/Downloads")
Education <- read_csv(file.path(data_dir, "DP_LIVE_07062020052624591.csv"))
## Parsed with column specification:
## cols(
## LOCATION = col_character(),
## INDICATOR = col_character(),
## SUBJECT = col_character(),
## MEASURE = col_character(),
## FREQUENCY = col_character(),
## TIME = col_double(),
## Value = col_double(),
## `Flag Codes` = col_logical()
## )
head(Education)
Poverty <- read_csv(file.path(data_dir, "DP_LIVE_07062020072746509.csv"))
## Parsed with column specification:
## cols(
## LOCATION = col_character(),
## INDICATOR = col_character(),
## SUBJECT = col_character(),
## MEASURE = col_character(),
## FREQUENCY = col_character(),
## TIME = col_double(),
## Value = col_double(),
## `Flag Codes` = col_character()
## )
head(Poverty)
GNI <- read_csv(file.path(data_dir, "DP_LIVE_07062020051604963.csv"))
## Parsed with column specification:
## cols(
## LOCATION = col_character(),
## INDICATOR = col_character(),
## SUBJECT = col_character(),
## MEASURE = col_character(),
## FREQUENCY = col_character(),
## TIME = col_double(),
## Value = col_double(),
## `Flag Codes` = col_character()
## )
head(GNI)
The first dataset, Education, records three levels of education under the variable “SUBJECT”. By spreading “SUBJECT” into separate columns, we create three new variables that replace it, so the dataset becomes wider and has fewer rows.
The second step in tidying the datasets is to keep only the useful variables and to filter for the years from 2015 onwards, because the “Poverty” dataset only has data for 2015 to 2018.
The third step is to rename the variable “Value” in each dataset to avoid confusion after the merge.
The merge is performed in two steps: i) combine “sPOVERTY” with “sEDUCATION” on the common variables LOCATION and TIME and name the output POVEDU; ii) merge POVEDU with “sGNI”, again using a full join so that rows with missing values are still included in the dataset.
# Reshape Education so that each SUBJECT code becomes its own column holding
# the corresponding Value, which reduces the number of rows.
# pivot_wider() is the current tidyr replacement for the superseded spread();
# arrange() keeps a deterministic LOCATION/TIME row order.
EDUADULT <- Education %>%
  pivot_wider(names_from = SUBJECT, values_from = Value) %>%
  arrange(LOCATION, TIME)
head(EDUADULT)

# Keep only the join keys and the measures of interest, from 2015 onwards.
# sEDUCATION needs no rename: after the reshape it has no "Value" column.
sEDUCATION <- EDUADULT %>%
  select(LOCATION, TIME, BUPPSRY, UPPSRY, TRY) %>%
  filter(TIME >= 2015)

# The same time window is applied to all three inputs for consistency.
# "Value" is renamed while selecting so the measures stay distinguishable
# after the merge.
sPOVERTY <- Poverty %>%
  filter(TIME >= 2015) %>%
  select(LOCATION, TIME, `Poverty Rate` = Value)

sGNI <- GNI %>%
  filter(TIME >= 2015, MEASURE == "USD_CAP") %>%
  select(LOCATION, TIME, `Total USD/capita` = Value)

# Merge in two steps. Full joins keep LOCATION/TIME combinations that are
# missing from one of the inputs; their measures become NA.
POVEDU <- full_join(sEDUCATION, sPOVERTY, by = c("LOCATION", "TIME"))
ALL <- full_join(POVEDU, sGNI, by = c("LOCATION", "TIME"))
head(ALL)
The attributes of the data were verified: the variables have the appropriate types, but LOCATION can be converted from character to a factor with multiple nominal levels.
Summary statistics are added in this section in order to understand the dataset before further manipulation, such as the imputation of missing values.
# This is the R chunk for the Understand Section
# Inspect the column types and the first values of the merged data.
str(ALL)
## tibble [216 x 7] (S3: tbl_df/tbl/data.frame)
## $ LOCATION : chr [1:216] "ARG" "ARG" "AUS" "AUS" ...
## $ TIME : num [1:216] 2017 2018 2015 2016 2017 ...
## $ BUPPSRY : num [1:216] 38.5 36.4 21 20.1 19 ...
## $ UPPSRY : num [1:216] 40.1 28 36.1 36.2 35.6 ...
## $ TRY : num [1:216] 21.4 35.7 42.9 43.7 45.4 ...
## $ Poverty Rate : num [1:216] NA NA NA 0.121 NA 0.124 0.087 0.098 0.094 NA ...
## $ Total USD/capita: num [1:216] NA NA 46186 48917 49695 ...
# LOCATION is read as character; store it as a factor (nominal levels).
ALL$LOCATION <- as.factor(ALL$LOCATION)
# Per-column summary statistics (min, quartiles, median, mean, max, NA count).
# summary() takes no na.rm argument: it always excludes NA from the statistics
# and reports the NA count on its own line, so the argument was dropped.
summary(ALL)
## LOCATION TIME BUPPSRY UPPSRY TRY
## CAN : 5 Min. :2015 Min. : 4.757 Min. :16.23 Min. : 6.98
## CHL : 5 1st Qu.:2016 1st Qu.:12.356 1st Qu.:35.17 1st Qu.:24.01
## CRI : 5 Median :2016 Median :19.254 Median :41.16 Median :35.85
## EU28 : 5 Mean :2017 Mean :24.350 Mean :41.96 Mean :34.10
## FIN : 5 3rd Qu.:2017 3rd Qu.:28.578 3rd Qu.:50.24 3rd Qu.:42.86
## ITA : 5 Max. :2019 Max. :64.566 Max. :70.97 Max. :57.89
## (Other):186 NA's :52 NA's :52 NA's :48
## Poverty Rate Total USD/capita
## Min. :0.05400 Min. : 7111
## 1st Qu.:0.09075 1st Qu.:28818
## Median :0.12150 Median :40813
## Mean :0.12693 Mean :40341
## 3rd Qu.:0.16600 3rd Qu.:50400
## Max. :0.26600 Max. :91557
## NA's :112 NA's :22
A variable TOTAL will be created to represent the sum of the variables BUPPSRY, UPPSRY and TRY. This variable is needed for the imputation of missing values in the different levels of education. Note that TOTAL should equal 100, because the education levels are percentages of the population.
# Creating the new variable TOTAL.
# The three education shares are percentages of the same population, so every
# row must add up to 100. TOTAL is therefore set to that known constant rather
# than to the row sum: the row sum is NA whenever a share is missing, which
# leaves impute_lr() nothing to deduce the missing share from.
all1 <- mutate(ALL, TOTAL = 100)
The first step in this section is to define the validation rules for imputing missing values in the different levels of education, using the variable TOTAL created in the preceding section.
The missing values in Poverty Rate and Total USD/capita will be replaced by their mean and median, respectively, because both are numerical variables.
The final step in this section is to verify whether any NA values are still present in the dataset and to check for special values, using a function taken from Dr. Anil Dolgun's website for Module 6 (Scan: Outliers).
# Rules for the education imputation: the three shares add up to TOTAL and
# none of them is negative.
Rules <- validator(
  BUPPSRY + UPPSRY + TRY == TOTAL,
  BUPPSRY >= 0,
  UPPSRY >= 0,
  TRY >= 0
)
# Fill in the missing shares that follow from the rules above.
IMPUTED_1 <- impute_lr(all1, Rules)
# Replace NA in Poverty Rate by the mean of the observed values; the count
# below is the number of values that were imputed.
X1 <- impute(ALL$`Poverty Rate`, fun = mean)
sum(is.imputed(X1))
## [1] 112
# Replace NA in Total USD/capita by the median of the observed values.
X2 <- impute(ALL$`Total USD/capita`, fun = median)
sum(is.imputed(X2))
## [1] 22
# Collect all imputed columns in one data set; previously IMPUTED_1, X1 and X2
# were computed but never combined. as.numeric() drops the "impute" class so
# the columns are plain numeric vectors. ALL itself is left unchanged.
# NOTE(review): the later chunks still read ALL; point them at IMPUTED_2 if
# the imputed values are meant to be used there.
IMPUTED_2 <- IMPUTED_1
IMPUTED_2$`Poverty Rate` <- as.numeric(X1)
IMPUTED_2$`Total USD/capita` <- as.numeric(X2)
# Checking for special values.
# For a numeric vector, returns a logical vector that is TRUE wherever the
# element is NA, NaN, Inf or -Inf. For any other input nothing is checked and
# NULL is returned invisibly.
is.specialorNA <- function(x) {
  if (!is.numeric(x)) {
    return(invisible(NULL))
  }
  # is.finite() is FALSE exactly for NA, NaN and +/-Inf.
  !is.finite(x)
}
# Apply the check to every column of ALL. Numeric columns give one logical per
# row (TRUE = NA, NaN or infinite); non-numeric columns give NULL.
sapply(ALL, is.specialorNA)
## $LOCATION
## NULL
##
## $TIME
## [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [13] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [25] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [37] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [49] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [61] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [73] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [85] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [97] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [109] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [121] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [133] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [145] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [157] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [169] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [181] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [193] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [205] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##
## $BUPPSRY
## [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [13] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [25] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [37] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [49] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [61] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [73] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [85] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE TRUE TRUE TRUE
## [97] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [109] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [121] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [133] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [145] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [157] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [169] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [181] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [193] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [205] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
##
## $UPPSRY
## [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [13] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [25] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [37] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [49] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [61] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [73] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [85] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE TRUE TRUE TRUE
## [97] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [109] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [121] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [133] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [145] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [157] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [169] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [181] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [193] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [205] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
##
## $TRY
## [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [13] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [25] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [37] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [49] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [61] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [73] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [85] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [97] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [109] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [121] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [133] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [145] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [157] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [169] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [181] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [193] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [205] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
##
## $`Poverty Rate`
## [1] TRUE TRUE TRUE FALSE TRUE FALSE FALSE FALSE FALSE TRUE FALSE FALSE
## [13] FALSE TRUE TRUE TRUE TRUE FALSE FALSE FALSE TRUE FALSE TRUE TRUE
## [25] TRUE FALSE FALSE TRUE TRUE TRUE TRUE FALSE FALSE FALSE FALSE FALSE
## [37] FALSE FALSE TRUE FALSE FALSE FALSE TRUE FALSE FALSE TRUE TRUE FALSE
## [49] FALSE FALSE TRUE FALSE FALSE FALSE TRUE FALSE FALSE FALSE TRUE FALSE
## [61] FALSE FALSE TRUE FALSE FALSE FALSE TRUE FALSE FALSE FALSE TRUE FALSE
## [73] FALSE FALSE TRUE TRUE TRUE TRUE FALSE FALSE TRUE FALSE TRUE TRUE
## [85] TRUE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE TRUE TRUE TRUE
## [97] FALSE FALSE FALSE TRUE FALSE FALSE FALSE TRUE FALSE FALSE FALSE TRUE
## [109] FALSE FALSE FALSE TRUE TRUE FALSE TRUE TRUE FALSE FALSE TRUE TRUE
## [121] FALSE FALSE FALSE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [133] FALSE FALSE FALSE TRUE FALSE FALSE FALSE TRUE TRUE FALSE TRUE TRUE
## [145] FALSE FALSE TRUE TRUE FALSE FALSE FALSE TRUE FALSE FALSE FALSE TRUE
## [157] FALSE TRUE TRUE TRUE FALSE FALSE FALSE TRUE FALSE TRUE TRUE TRUE
## [169] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE TRUE TRUE
## [181] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [193] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [205] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
##
## $`Total USD/capita`
## [1] TRUE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [13] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [25] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [37] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [49] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [61] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [73] FALSE FALSE FALSE TRUE TRUE TRUE FALSE FALSE FALSE TRUE TRUE TRUE
## [85] TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [97] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [109] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [121] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE TRUE TRUE TRUE
## [133] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [145] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [157] FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE
## [169] FALSE FALSE TRUE TRUE TRUE TRUE TRUE TRUE TRUE FALSE FALSE FALSE
## [181] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [193] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [205] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
In this step, we scan the dataset for outliers using a multivariate method, as we are dealing with more than two variables. The method measures the Mahalanobis distance and displays it with a QQ plot. The outliers are then excluded using the mvn() function, as only 7 outliers are present.
# Detect multivariate outliers.
# Only the five numeric measures are passed to mvn(); LOCATION and TIME are
# left out.
outlier <- dplyr::select(
  ALL,
  BUPPSRY, UPPSRY, TRY, `Poverty Rate`, `Total USD/capita`
)
# showOutliers = TRUE makes mvn() return the rows it flags as outliers.
TOTAL <- mvn(
  data = outlier,
  multivariateOutlierMethod = "quan",
  showOutliers = TRUE
)
## Warning in covMcd(data, alpha = alpha): The covariance matrix has become singular during
## the iterations of the MCD algorithm.
## There are 11 observations (in the entire dataset of 93 obs.) lying on
## the hyperplane with equation a_1*(x_i1 - m_1) + ... + a_p*(x_ip - m_p)
## = 0 with (m_1, ..., m_p) the mean of these observations and
## coefficients a_i from the vector a <- c(0.5773503, 0.5773503,
## 0.5773503, 5.2e-06, 0)
# Rows flagged as multivariate outliers by the preceding mvn() call.
TOTAL$multivariateOutliers
#EXCLUDING OUTLIERS
# showNewData = TRUE makes mvn() also return the data without the flagged rows.
TOTAL <- mvn(
  data = outlier,
  multivariateOutlierMethod = "quan",
  showOutliers = TRUE,
  showNewData = TRUE
)
# Keep the outlier-free data; previously it stayed unused inside TOTAL, so no
# rows were actually excluded from anything.
ALL_NO_OUTLIERS <- TOTAL$newData
## Warning in covMcd(data, alpha = alpha): The covariance matrix has become singular during
## the iterations of the MCD algorithm.
## There are 13 observations (in the entire dataset of 93 obs.) lying on
## the hyperplane with equation a_1*(x_i1 - m_1) + ... + a_p*(x_ip - m_p)
## = 0 with (m_1, ..., m_p) the mean of these observations and
## coefficients a_i from the vector a <- c(0.5773503, 0.5773503,
## 0.5773503, -5.3e-06, 0)
We select the variable Total USD/capita for transformation so that it better approximates a normal distribution. This is done with the square root transformation, which has the advantage that it can also be applied to zero values.
From the resulting histogram we can see that the data are more symmetrical.
# Drawing a histogram of Total USD/capita with a normal curve overlaid, to see
# the distribution of the data.
# - The histogram uses the density scale so that it is comparable with dnorm(),
#   which returns densities rather than counts.
# - mean() and sd() need na.rm = TRUE: the column contains NA, and without it
#   both statistics are NA and the curve is dropped from the plot.
# - bins = 30 states the default explicitly.
ggplot(ALL, aes(x = `Total USD/capita`)) +
  geom_histogram(aes(y = after_stat(density)), bins = 30) +
  stat_function(
    fun = dnorm,
    args = list(
      mean = mean(ALL$`Total USD/capita`, na.rm = TRUE),
      sd = sd(ALL$`Total USD/capita`, na.rm = TRUE)
    )
  )
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 22 rows containing non-finite values (stat_bin).
## Warning: Removed 101 row(s) containing missing values (geom_path).
# Square-root transform of Total USD/capita: compresses large values and
# spreads out small ones. The result is shown with a basic histogram.
sqrt_total <- sqrt(ALL[["Total USD/capita"]])
hist(sqrt_total)
OECD (2020), Poverty rate (indicator). doi: 10.1787/0fe1315d-en (Accessed on 05 June 2020)
OECD (2020), Adult education level (indicator). doi: 10.1787/36bce3fe-en (Accessed on 07 June 2020)
OECD (2020), Gross national income (indicator). doi: 10.1787/8a36773a-en (Accessed on 05 June 2020)