DATA 606 Data Project Proposal

Data Preparation

#Load Libraries
library(tidyverse)
library(reshape2)
library(sqldf)

#Load data from github(testing with local files)
Gender_StatsData <- read.csv("C:/Users/aisha/Desktop/DATA606_pp/Gender_Stats_csv/Gender_StatsData.csv")
Gender_StatsSeries <- read.csv("C:/Users/aisha/Desktop/DATA606_pp/Gender_Stats_csv/Gender_StatsSeries.csv")

#Changing Column Name to merge two file systems
colnames(Gender_StatsData)[4] <- "Indicator"
colnames(Gender_StatsSeries)[1] <- "Indicator"

#Merging two files based on Indicator
Gender_data <- merge(x = Gender_StatsData, y = Gender_StatsSeries, by = "Indicator", all.x = TRUE)
colnames(Gender_data)[2] <- "Country"

#Identifying and generating the uniqueness by combining the Country Code and Indicator
Gender_data$Unq_Key <- paste0(Gender_data$Country.Code,".",Gender_data$Indicator, collapse = NULL)

#Picking bottom 10 Countries based on very low GDP Rate
#NY.GDP.MKTP.CD Stands for GDP (current US$)
Gender_data_GDP <- Gender_data[Gender_data$Indicator == 'NY.GDP.MKTP.CD',]

#Applying tidying function to pick bottom 10 countries based on low GDP
df <- Gender_data_GDP %>% mutate_all(funs(replace_na(.,0)))
df$sum_gdp <- rowSums(df[,c(5:63)])
bottom10 <- df %>% select(Country,Country.Code, sum_gdp) %>% filter(sum_gdp != 0) %>% arrange((sum_gdp)) %>% top_n(-10)
bottom10

##                  Country Country.Code    sum_gdp
## 1                 Tuvalu          TUV  617142618
## 2                  Nauru          NRU  871896578
## 3  Sao Tome and Principe          STP 3527878137
## 4                  Palau          PLW 3673161400
## 5               Kiribati          KIR 3783413451
## 6       Marshall Islands          MHL 4295623730
## 7         Cayman Islands          CYM 5523120579
## 8  Micronesia, Fed. Sts.          FSM 7627666300
## 9                  Tonga          TON 8589302695
## 10        American Samoa          ASM 9339000000

#Filter data only the selected countries with given indicators
country10 <- Gender_data[Gender_data$Country.Code %in%  c('TUV', 'NRU', 'STP','PLW','KIR','MHL','CYM','FSM','TON','ASM'),]
#Further filter based on the Indicators
#NY.GDP.MKTP.CD - market GDP
#SP.DYN.CBRT.IN - birth Rate
#SP.DYN.CDRT.IN - death rate
#SP.DYN.TFRT.IN - fertility rate
#FP.CPI.TOTL.ZG - Inflation Rate
country_indicators <- country10[country10$Indicator %in%  c('NY.GDP.MKTP.CD','SP.DYN.CBRT.IN','SP.DYN.CDRT.IN','FP.CPI.TOTL.ZG'),]

Research question

You should phrase your research question in a way that matches up with the scope of inference your dataset allows for.

To find the gender statistics on bottom 10 lower GDP countries in the world and identify the area need to focused for improvement
The factor used for this project are : Gender popuation against education/birth/death/fertility rates and country inflation

Cases

What are the cases, and how many are there?

Each case represent the various gender statistics metrics between the year 1960 - 2018, There are 138864 observations

Data collection

Describe the method of data collection.

The Gender Statistics, The World Bank database is a comprehensive source for the latest sex-disaggregated data and gender statistics covering demography, education, health, access to economic opportunities, public life and decision-making, and agency.

https://datacatalog.worldbank.org/dataset/gender-statistics

Type of study

What type of study is this (observational/experiment)?

This is an observational study.

Data Source

If you collected the data, state self-collected. If not, provide a citation/link.

Data is collected by World Bank and is available online here: https://databank.worldbank.org/data/download/Gender_Stats_csv.zip

This project uses the same file from github location

https://github.com/thasleem1/DATA606/tree/master/Gender_Stats_csv

External Contact Email: data@worldbank.org
Data Temporal Coverage: 1960 - 2018
Data Last Updated: January 25, 2019

Response

What is the response variable, and what type is it (numerical/categorical)?

The response variable is numerical based on country performance (captured numerical) on education/birth/death/fertility rates and country inflation

Explanatory

What is the explanatory variable, and what type is it (numerical/categorical)?

The explanatory variable is Categorical (bottom 10 lower GDP countries)

Relevant summary statistics

Provide summary statistics relevant to your research question. For example, if you’re comparing means across groups provide means, SDs, sample sizes of each group. This step requires the use of R, hence a code chunk is provided below. Insert more code chunks as needed.

#Analysis of Tonga
Tonga <- sqldf("select * from Gender_data where `Country.Code` = 'TON' and Indicator IN ('SP.DYN.CBRT.IN','SP.DYN.CDRT.IN','SP.DYN.TFRT.IN','FP.CPI.TOTL.ZG')");
Tonga_birthrate <-Tonga %>% select(4:63)

#Reshaping the data
mdata <- melt(Tonga_birthrate, id=c("Indicator.Name.x"))
mdata_chk <- sqldf("select case when `Indicator.Name.x` = 'Inflation, consumer prices (annual %)' then 'Inflation' 
when `Indicator.Name.x` = 'Birth rate, crude (per 1,000 people)' then 'Birth'
when `Indicator.Name.x` = 'Death rate, crude (per 1,000 people)' then 'Death'
when `Indicator.Name.x` = 'Fertility rate, total (births per woman)' then 'Fertility'
else `Indicator.Name.x`
end as Indicator,substr(variable,2,5) as year,case when value is null then 0 else value end as value from mdata");
head(mdata_chk)

##   Indicator year  value
## 1 Inflation 1960  0.000
## 2     Birth 1960 47.175
## 3     Death 1960  9.034
## 4 Fertility 1960  7.363
## 5 Inflation 1961  0.000
## 6     Birth 1961 46.721

Inflation_summary <- mdata_chk %>% select(Indicator,year,value) %>% filter(Indicator == 'Inflation')
summary(Inflation_summary)

##   Indicator             year               value       
##  Length:59          Length:59          Min.   :-1.054  
##  Class :character   Class :character   1st Qu.: 0.000  
##  Mode  :character   Mode  :character   Median : 3.274  
##                                        Mean   : 5.131  
##                                        3rd Qu.: 9.125  
##                                        Max.   :22.393

Birth_summary <- mdata_chk %>% select(Indicator,year,value) %>% filter(Indicator == 'Birth')
summary(Birth_summary)

##   Indicator             year               value      
##  Length:59          Length:59          Min.   : 0.00  
##  Class :character   Class :character   1st Qu.:28.34  
##  Mode  :character   Mode  :character   Median :31.47  
##                                        Mean   :31.61  
##                                        3rd Qu.:34.45  
##                                        Max.   :47.17

Death_summary <- mdata_chk %>% select(Indicator,year,value) %>% filter(Indicator == 'Death')
summary(Death_summary)

##   Indicator             year               value      
##  Length:59          Length:59          Min.   :0.000  
##  Class :character   Class :character   1st Qu.:6.035  
##  Mode  :character   Mode  :character   Median :6.168  
##                                        Mean   :6.223  
##                                        3rd Qu.:6.247  
##                                        Max.   :9.034

Fertility_summary <- mdata_chk %>% select(Indicator,year,value) %>% filter(Indicator == 'Fertility')
summary(Fertility_summary)

##   Indicator             year               value      
##  Length:59          Length:59          Min.   :0.000  
##  Class :character   Class :character   1st Qu.:4.189  
##  Mode  :character   Mode  :character   Median :4.691  
##                                        Mean   :4.897  
##                                        3rd Qu.:5.537  
##                                        Max.   :7.363

par(mfrow=c(2,2))
plot(Inflation_summary$year, Inflation_summary$value, type="o")
title(main = "Inflation")
plot(Birth_summary$year, Birth_summary$value, type="o")
title(main = "Birth")
plot(Death_summary$year, Death_summary$value, type="o")
title(main = "Death")
plot(Fertility_summary$year, Fertility_summary$value, type="o")
title(main = "Fertility")