Setup


# Load the necessary packages required to reproduce the report. For example:


library(magrittr)
library(knitr) 
library(kableExtra)
library(readr) 
library(tidyr)
library(dplyr)
library(magrittr) 
library(stringi)  
library(editrules)

Student names, numbers and percentage of contributions



Executive Summary

The goal of this report is to preprocess data before analysis by understanding, tidying, manipulating, scanning, and transforming it. Country-wise malnutrition and life expectancy datasets are used and merged on the common column ‘Country’. For numeric variables, box plots are drawn and outliers are identified using Tukey’s method.
Histograms are used to examine how all numerical variables are distributed. The aim is to make sure that the data is free of mistakes, consistent, and ready for further statistical analysis.

Data

Dataset 1 ‘country wise life expectancy’ URL: https://www.kaggle.com/amansaxena/lifeexpectancy?select=Life_expectancy_dataset.csv

Dataset 2 ‘malnutrition data of various countries’ URL: https://www.kaggle.com/ruchi798/malnutrition-across-the-globe?select=country-wise-average.csv


# ---- Dataset 1: life expectancy by country ----
LE_df1 <- read_csv("Life_expectancy_dataset.csv")
spec(LE_df1)

# Columns 4 and 5 are renamed; column 3 ("Overall Life") is not required
# and is dropped.
names(LE_df1)[4:5] <- c("Male_life", "Female_life")
LE_df1 <- LE_df1[, -3]

head(LE_df1) %>%
  kable(format = "html", align = "l") %>%
  kable_styling(bootstrap_options = "striped")

# Reshape to tidy (long) format: one row per country and gender column.
# Country names are upper-cased before they are used as the join key.
LE_df1_tidy <- LE_df1 %>%
  pivot_longer(
    cols = 3:4,
    names_to = "Gender",
    values_to = "Life Expectancy"
  ) %>%
  mutate(Country = stri_trans_toupper(Country))

# Check the last rows of the tidy data.
tail(LE_df1_tidy) %>%
  kable(format = "html", align = "l") %>%
  kable_styling(bootstrap_options = "striped")

# ---- Dataset 2: malnutrition by country ----
CW_df2 <- read_csv("country-wise-average.csv")
spec(CW_df2)

head(CW_df2) %>%
  kable(format = "html", align = "l") %>%
  kable_styling(bootstrap_options = "striped")

# Combine malnutrition and life expectancy per country. A left join keeps
# every malnutrition row, matched or not.
df_merged <- CW_df2 %>%
  left_join(LE_df1_tidy, by = "Country")

# Check the last rows of the merged dataset.
tail(df_merged) %>%
  kable(format = "html", align = "l") %>%
  kable_styling(bootstrap_options = "striped")



Understand

I will look at the data and try to figure out what it means by checking its size, attributes, and structure. Then I’ll go ahead and change the type of the columns when necessary.

# Understand: inspect the merged data, then convert the categorical columns
# to factors.

spec(df_merged)
str(df_merged)

# Continent is a nominal category.
df_merged$Continent <- factor(
  df_merged$Continent,
  levels = c(
    "Africa", "Europe", "Asia", "Oceania", "North America", "South America"
  )
)

# Gender was produced by pivot_longer() from the columns "Male_life" and
# "Female_life", so its values carry a "_life" suffix. Strip the suffix
# before building the factor: otherwise no value matches the levels, the
# whole column becomes NA, and the later complete.cases() filter drops
# every row.
df_merged$Gender <- factor(
  sub("_life$", "", df_merged$Gender),
  levels = c("Male", "Female")
)

# Income Classification is coded 0-3 and has a natural order, so it becomes
# an ordered factor with descriptive labels.
df_merged[["Income Classification"]] <- factor(
  df_merged[["Income Classification"]],
  levels = c(0, 1, 2, 3),
  labels = c(
    "Low Income", "Lower Middle Income", "Upper Middle Income", "High Income"
  ),
  ordered = TRUE
)

attributes(df_merged)

Tidy & Manipulate Data I

# The life-expectancy data was already cleaned before it was merged with dataset 2; see the "Data" section.
# There, the pivot_longer() call reshaped the data into tidy (long) format (this covers the 'Tidy & Manipulate Data I' section).



Tidy & Manipulate Data II

# Tidy & Manipulate Data II: derive a new variable from existing ones.
# The new column combines the underweight and overweight percentages and
# scales them by the under-5 population (divided by 100 because the inputs
# are percentages).
df_mutated <- df_merged %>%
  mutate(
    `Under5 Weight Population` =
      (Underweight + Overweight) * `U5 Population ('000s)` / 100
  )

# Show the last rows of a subset of columns, including the new one.
shown_columns <- c(1, 2, 5, 7, 8, 11:13)
tail(df_mutated[shown_columns]) %>%
  kable(format = "html", align = "l") %>%
  kable_styling(bootstrap_options = "striped")



Scan I

Missing values are scanned and omitted at this step


# Missing values per column before cleaning.
count_missing <- function(column) sum(is.na(column))
vapply(df_merged, count_missing, integer(1))

# Keep only the rows without any NA; no special values were found.
is_complete <- complete.cases(df_merged)
df_merged <- df_merged[is_complete, ]

# Missing values per column after cleaning.
vapply(df_merged, count_missing, integer(1))



Scan II

Box plot for each of the numerical variables using Tukey’s method for finding outliers.



# Scan II: box plots of the numerical variables. boxplot() draws points
# that fall beyond the whiskers individually; those are the candidate
# outliers.

# Give the under-5 population column a friendlier name. Matching on the
# original name (instead of position 8) keeps this correct if the column
# order changes.
u5_column <- names(df_merged) == "U5 Population ('000s)"
names(df_merged)[u5_column] <- "Under 5 Population in thousands"

boxplot(
  df_merged[["Under 5 Population in thousands"]],
  main = "Under 5 years Population in thousands",
  ylab = "Population",
  col = "red"
)

# Stunting percentage and life expectancy, side by side. This panel used to
# be drawn twice; it is now drawn once.
old_par <- par(mfrow = c(1, 2))
boxplot(
  df_merged[["Stunting"]],
  main = "Stunting %", ylab = "Percentage", col = "red"
)
boxplot(
  df_merged[["Life Expectancy"]],
  main = "Life expectancy", ylab = "Age", col = "red"
)
par(old_par)

# Remaining percentage variables in a 2 x 2 grid.
old_par <- par(mfrow = c(2, 2))
boxplot(
  df_merged[["Severe Wasting"]],
  main = "Severe Wasting %", ylab = "Percentage", col = "red"
)
boxplot(
  df_merged[["Wasting"]],
  main = "Wasting %", ylab = "Percentage", col = "red"
)
boxplot(
  df_merged[["Overweight"]],
  main = "Over weight %", ylab = "Percentage", col = "red"
)
boxplot(
  df_merged[["Underweight"]],
  main = "Under weight %", ylab = "Percentage", col = "red"
)
par(old_par)



Transform

Histograms of all the numerical variables are examined, and the variable “wasting” is chosen for transformation. The square root transformation gives a better result than the log transformation.

# Pull each numerical variable out of the cleaned data as a plain vector.
under5_population <- df_merged[["Under 5 Population in thousands"]]
under_weight <- df_merged[["Underweight"]]
over_weight <- df_merged[["Overweight"]]
wasting <- df_merged[["Wasting"]]
severe_wasting <- df_merged[["Severe Wasting"]]
stunting <- df_merged[["Stunting"]]
life_expectancy <- df_merged[["Life Expectancy"]]

# Draw all seven histograms on one 3 x 3 page, then reset the layout.
par(mfrow = c(3, 3))
hist(under5_population)
hist(under_weight)
hist(over_weight)
hist(wasting)
hist(severe_wasting)
hist(stunting)
hist(life_expectancy)
par(mfrow = c(1, 1))


# Candidate transformations of `wasting`, each followed by its histogram so
# the resulting shapes can be compared.

# Square root.
sqrt_wasting <- sqrt(wasting)
hist(sqrt_wasting)

# Base-10 logarithm.
log_wasting <- log10(wasting)
hist(log_wasting)

# Natural logarithm.
ln_wasting <- log(wasting)
hist(ln_wasting)



---
title: "Data Wrangling (Data Preprocessing)"
author: "EI THIRI LWIN"
subtitle: Practical assessment 2
date: "3 OCT 2022"
output:
  html_notebook: default
  pdf_document: default
  html_document:
    df_print: paged
---


## **Setup**

```{r}

# Load the necessary packages required to reproduce the report. For example:


library(magrittr)
library(knitr) 
library(kableExtra)
library(readr) 
library(tidyr)
library(dplyr)
library(magrittr) 
library(stringi)  
library(editrules)
```


## **Student names, numbers and percentage of contributions**
```{r, echo=FALSE}


# Contribution table: student name, student number and contribution share.
na <- "EI THIRI LWIN"
no <- "S3866360"
pc <- "100%"

s <- data.frame(na, no, pc)
names(s) <- c("Student name", "Student number", "Percentage of contribution")

kable_classic(
  kbl(s, caption = "Individual"),
  full_width = FALSE,
  html_font = "Cambria"
)

```
<br>
<br>

## **Executive Summary**

The goal of this report is to preprocess data before analysis by understanding, tidying, manipulating, scanning, and transforming it. Country-wise malnutrition and life expectancy datasets are used and merged on the common column ‘Country’. For numeric variables, box plots are drawn and outliers are identified using Tukey's method.
<br>
Histograms are used to examine how all numerical variables are distributed. The aim is to make sure that the data is free of mistakes, consistent, and ready for further statistical analysis.
<br>

## **Data**

Dataset 1 'country wise life expectancy'
URL: https://www.kaggle.com/amansaxena/lifeexpectancy?select=Life_expectancy_dataset.csv


Dataset 2 'malnutrition data of various countries '
URL: https://www.kaggle.com/ruchi798/malnutrition-across-the-globe?select=country-wise-average.csv

```{r}

# ---- Dataset 1: life expectancy by country ----
LE_df1 <- read_csv("Life_expectancy_dataset.csv")
spec(LE_df1)

# Columns 4 and 5 are renamed; column 3 ("Overall Life") is not required
# and is dropped.
names(LE_df1)[4:5] <- c("Male_life", "Female_life")
LE_df1 <- LE_df1[, -3]

head(LE_df1) %>%
  kable(format = "html", align = "l") %>%
  kable_styling(bootstrap_options = "striped")

# Reshape to tidy (long) format: one row per country and gender column.
# Country names are upper-cased before they are used as the join key.
LE_df1_tidy <- LE_df1 %>%
  pivot_longer(
    cols = 3:4,
    names_to = "Gender",
    values_to = "Life Expectancy"
  ) %>%
  mutate(Country = stri_trans_toupper(Country))

# Check the last rows of the tidy data.
tail(LE_df1_tidy) %>%
  kable(format = "html", align = "l") %>%
  kable_styling(bootstrap_options = "striped")

# ---- Dataset 2: malnutrition by country ----
CW_df2 <- read_csv("country-wise-average.csv")
spec(CW_df2)

head(CW_df2) %>%
  kable(format = "html", align = "l") %>%
  kable_styling(bootstrap_options = "striped")

# Combine malnutrition and life expectancy per country. A left join keeps
# every malnutrition row, matched or not.
df_merged <- CW_df2 %>%
  left_join(LE_df1_tidy, by = "Country")

# Check the last rows of the merged dataset.
tail(df_merged) %>%
  kable(format = "html", align = "l") %>%
  kable_styling(bootstrap_options = "striped")


```

<br>
<br>

## **Understand** 

I will look at the data and try to figure out what it means by checking its size, attributes, and structure. Then I'll go ahead and change the type of the columns when necessary.
<br>
<br>

```{r}
# Understand: inspect the merged data, then convert the categorical columns
# to factors.

spec(df_merged)
str(df_merged)

# Continent is a nominal category.
df_merged$Continent <- factor(
  df_merged$Continent,
  levels = c(
    "Africa", "Europe", "Asia", "Oceania", "North America", "South America"
  )
)

# Gender was produced by pivot_longer() from the columns "Male_life" and
# "Female_life", so its values carry a "_life" suffix. Strip the suffix
# before building the factor: otherwise no value matches the levels, the
# whole column becomes NA, and the later complete.cases() filter drops
# every row.
df_merged$Gender <- factor(
  sub("_life$", "", df_merged$Gender),
  levels = c("Male", "Female")
)

# Income Classification is coded 0-3 and has a natural order, so it becomes
# an ordered factor with descriptive labels.
df_merged[["Income Classification"]] <- factor(
  df_merged[["Income Classification"]],
  levels = c(0, 1, 2, 3),
  labels = c(
    "Low Income", "Lower Middle Income", "Upper Middle Income", "High Income"
  ),
  ordered = TRUE
)

attributes(df_merged)
```



## **Tidy & Manipulate Data I**


```{r}
# The life-expectancy data was already cleaned before it was merged with dataset 2; see the "Data" section.
# There, the pivot_longer() call reshaped the data into tidy (long) format (this covers the 'Tidy & Manipulate Data I' section).
```

<br>
<br>

## **Tidy & Manipulate Data II** 

```{r}
# Tidy & Manipulate Data II: derive a new variable from existing ones.
# The new column combines the underweight and overweight percentages and
# scales them by the under-5 population (divided by 100 because the inputs
# are percentages).
df_mutated <- df_merged %>%
  mutate(
    `Under5 Weight Population` =
      (Underweight + Overweight) * `U5 Population ('000s)` / 100
  )

# Show the last rows of a subset of columns, including the new one.
shown_columns <- c(1, 2, 5, 7, 8, 11:13)
tail(df_mutated[shown_columns]) %>%
  kable(format = "html", align = "l") %>%
  kable_styling(bootstrap_options = "striped")
```
 

<br>
<br>

## **Scan I**

Missing values are scanned and omitted at this step.

```{r}

# Missing values per column before cleaning.
count_missing <- function(column) sum(is.na(column))
vapply(df_merged, count_missing, integer(1))

# Keep only the rows without any NA; no special values were found.
is_complete <- complete.cases(df_merged)
df_merged <- df_merged[is_complete, ]

# Missing values per column after cleaning.
vapply(df_merged, count_missing, integer(1))

```

<br>
<br>

## **Scan II**

Box plots are drawn for each of the numerical variables to find outliers using Tukey's method.

```{r}


# Scan II: box plots of the numerical variables. boxplot() draws points
# that fall beyond the whiskers individually; those are the candidate
# outliers.

# Give the under-5 population column a friendlier name. Matching on the
# original name (instead of position 8) keeps this correct if the column
# order changes.
u5_column <- names(df_merged) == "U5 Population ('000s)"
names(df_merged)[u5_column] <- "Under 5 Population in thousands"

boxplot(
  df_merged[["Under 5 Population in thousands"]],
  main = "Under 5 years Population in thousands",
  ylab = "Population",
  col = "red"
)

# Stunting percentage and life expectancy, side by side. This panel used to
# be drawn twice; it is now drawn once.
old_par <- par(mfrow = c(1, 2))
boxplot(
  df_merged[["Stunting"]],
  main = "Stunting %", ylab = "Percentage", col = "red"
)
boxplot(
  df_merged[["Life Expectancy"]],
  main = "Life expectancy", ylab = "Age", col = "red"
)
par(old_par)

# Remaining percentage variables in a 2 x 2 grid.
old_par <- par(mfrow = c(2, 2))
boxplot(
  df_merged[["Severe Wasting"]],
  main = "Severe Wasting %", ylab = "Percentage", col = "red"
)
boxplot(
  df_merged[["Wasting"]],
  main = "Wasting %", ylab = "Percentage", col = "red"
)
boxplot(
  df_merged[["Overweight"]],
  main = "Over weight %", ylab = "Percentage", col = "red"
)
boxplot(
  df_merged[["Underweight"]],
  main = "Under weight %", ylab = "Percentage", col = "red"
)
par(old_par)


```


<br>
<br>

## **Transform**

Histograms of all the numerical variables are examined, and the variable "wasting" is chosen for transformation.
The square root transformation gives a better result than the log transformation.

```{r}
# Pull each numerical variable out of the cleaned data as a plain vector.
under5_population <- df_merged[["Under 5 Population in thousands"]]
under_weight <- df_merged[["Underweight"]]
over_weight <- df_merged[["Overweight"]]
wasting <- df_merged[["Wasting"]]
severe_wasting <- df_merged[["Severe Wasting"]]
stunting <- df_merged[["Stunting"]]
life_expectancy <- df_merged[["Life Expectancy"]]

# Draw all seven histograms on one 3 x 3 page, then reset the layout.
par(mfrow = c(3, 3))
hist(under5_population)
hist(under_weight)
hist(over_weight)
hist(wasting)
hist(severe_wasting)
hist(stunting)
hist(life_expectancy)
par(mfrow = c(1, 1))


# Candidate transformations of `wasting`, each followed by its histogram so
# the resulting shapes can be compared.

# Square root.
sqrt_wasting <- sqrt(wasting)
hist(sqrt_wasting)

# Base-10 logarithm.
log_wasting <- log10(wasting)
hist(log_wasting)

# Natural logarithm.
ln_wasting <- log(wasting)
hist(ln_wasting)
```
 

<br>
<br>

