#Load Libraries
library(tidyverse)
library(reshape2)
library(sqldf)
#Load data from github(testing with local files)
Gender_StatsData <- read.csv("C:/Users/aisha/Desktop/DATA606_pp/Gender_Stats_csv/Gender_StatsData.csv")
Gender_StatsSeries <- read.csv("C:/Users/aisha/Desktop/DATA606_pp/Gender_Stats_csv/Gender_StatsSeries.csv")
#Changing Column Name to merge two file systems
colnames(Gender_StatsData)[4] <- "Indicator"
colnames(Gender_StatsSeries)[1] <- "Indicator"
#Merging two files based on Indicator
Gender_data <- merge(x = Gender_StatsData, y = Gender_StatsSeries, by = "Indicator", all.x = TRUE)
colnames(Gender_data)[2] <- "Country"
#Identifying and generating the uniqueness by combining the Country Code and Indicator
Gender_data$Unq_Key <- paste0(Gender_data$Country.Code,".",Gender_data$Indicator, collapse = NULL)
#Picking bottom 10 Countries based on very low GDP Rate
#NY.GDP.MKTP.CD Stands for GDP (current US$)
Gender_data_GDP <- Gender_data[Gender_data$Indicator == 'NY.GDP.MKTP.CD',]
#Applying tidying function to pick bottom 10 countries based on low GDP
df <- Gender_data_GDP %>% mutate_all(funs(replace_na(.,0)))
df$sum_gdp <- rowSums(df[,c(5:63)])
bottom10 <- df %>% select(Country,Country.Code, sum_gdp) %>% filter(sum_gdp != 0) %>% arrange((sum_gdp)) %>% top_n(-10)
bottom10
## Country Country.Code sum_gdp
## 1 Tuvalu TUV 617142618
## 2 Nauru NRU 871896578
## 3 Sao Tome and Principe STP 3527878137
## 4 Palau PLW 3673161400
## 5 Kiribati KIR 3783413451
## 6 Marshall Islands MHL 4295623730
## 7 Cayman Islands CYM 5523120579
## 8 Micronesia, Fed. Sts. FSM 7627666300
## 9 Tonga TON 8589302695
## 10 American Samoa ASM 9339000000
#Filter data only the selected countries with given indicators
country10 <- Gender_data[Gender_data$Country.Code %in% c('TUV', 'NRU', 'STP','PLW','KIR','MHL','CYM','FSM','TON','ASM'),]
#Further filter based on the Indicators
#NY.GDP.MKTP.CD - market GDP
#SP.DYN.CBRT.IN - birth Rate
#SP.DYN.CDRT.IN - death rate
#SP.DYN.TFRT.IN - fertility rate
#FP.CPI.TOTL.ZG - Inflation Rate
country_indicators <- country10[country10$Indicator %in% c('NY.GDP.MKTP.CD','SP.DYN.CBRT.IN','SP.DYN.CDRT.IN','FP.CPI.TOTL.ZG'),]
You should phrase your research question in a way that matches up with the scope of inference your dataset allows for.
To find the gender statistics on bottom 10 lower GDP countries in the world and identify the area need to focused for improvement
The factor used for this project are : Gender popuation against education/birth/death/fertility rates and country inflation
What are the cases, and how many are there?
Each case represent the various gender statistics metrics between the year 1960 - 2018, There are 138864 observations
Describe the method of data collection.
The Gender Statistics, The World Bank database is a comprehensive source for the latest sex-disaggregated data and gender statistics covering demography, education, health, access to economic opportunities, public life and decision-making, and agency.
What type of study is this (observational/experiment)?
This is an observational study.
If you collected the data, state self-collected. If not, provide a citation/link.
Data is collected by World Bank and is available online here: https://databank.worldbank.org/data/download/Gender_Stats_csv.zip
This project uses the same file from github location
https://github.com/thasleem1/DATA606/tree/master/Gender_Stats_csv
External Contact Email: data@worldbank.org
Data Temporal Coverage: 1960 - 2018
Data Last Updated: January 25, 2019
What is the response variable, and what type is it (numerical/categorical)?
The response variable is numerical based on country performance (captured numerical) on education/birth/death/fertility rates and country inflation
What is the explanatory variable, and what type is it (numerical/categorical)?
The explanatory variable is Categorical (bottom 10 lower GDP countries)
Provide summary statistics relevant to your research question. For example, if youโre comparing means across groups provide means, SDs, sample sizes of each group. This step requires the use of R, hence a code chunk is provided below. Insert more code chunks as needed.
#Analysis of Tonga
Tonga <- sqldf("select * from Gender_data where `Country.Code` = 'TON' and Indicator IN ('SP.DYN.CBRT.IN','SP.DYN.CDRT.IN','SP.DYN.TFRT.IN','FP.CPI.TOTL.ZG')");
Tonga_birthrate <-Tonga %>% select(4:63)
#Reshaping the data
mdata <- melt(Tonga_birthrate, id=c("Indicator.Name.x"))
mdata_chk <- sqldf("select case when `Indicator.Name.x` = 'Inflation, consumer prices (annual %)' then 'Inflation'
when `Indicator.Name.x` = 'Birth rate, crude (per 1,000 people)' then 'Birth'
when `Indicator.Name.x` = 'Death rate, crude (per 1,000 people)' then 'Death'
when `Indicator.Name.x` = 'Fertility rate, total (births per woman)' then 'Fertility'
else `Indicator.Name.x`
end as Indicator,substr(variable,2,5) as year,case when value is null then 0 else value end as value from mdata");
head(mdata_chk)
## Indicator year value
## 1 Inflation 1960 0.000
## 2 Birth 1960 47.175
## 3 Death 1960 9.034
## 4 Fertility 1960 7.363
## 5 Inflation 1961 0.000
## 6 Birth 1961 46.721
Inflation_summary <- mdata_chk %>% select(Indicator,year,value) %>% filter(Indicator == 'Inflation')
summary(Inflation_summary)
## Indicator year value
## Length:59 Length:59 Min. :-1.054
## Class :character Class :character 1st Qu.: 0.000
## Mode :character Mode :character Median : 3.274
## Mean : 5.131
## 3rd Qu.: 9.125
## Max. :22.393
Birth_summary <- mdata_chk %>% select(Indicator,year,value) %>% filter(Indicator == 'Birth')
summary(Birth_summary)
## Indicator year value
## Length:59 Length:59 Min. : 0.00
## Class :character Class :character 1st Qu.:28.34
## Mode :character Mode :character Median :31.47
## Mean :31.61
## 3rd Qu.:34.45
## Max. :47.17
Death_summary <- mdata_chk %>% select(Indicator,year,value) %>% filter(Indicator == 'Death')
summary(Death_summary)
## Indicator year value
## Length:59 Length:59 Min. :0.000
## Class :character Class :character 1st Qu.:6.035
## Mode :character Mode :character Median :6.168
## Mean :6.223
## 3rd Qu.:6.247
## Max. :9.034
Fertility_summary <- mdata_chk %>% select(Indicator,year,value) %>% filter(Indicator == 'Fertility')
summary(Fertility_summary)
## Indicator year value
## Length:59 Length:59 Min. :0.000
## Class :character Class :character 1st Qu.:4.189
## Mode :character Mode :character Median :4.691
## Mean :4.897
## 3rd Qu.:5.537
## Max. :7.363
par(mfrow=c(2,2))
plot(Inflation_summary$year, Inflation_summary$value, type="o")
title(main = "Inflation")
plot(Birth_summary$year, Birth_summary$value, type="o")
title(main = "Birth")
plot(Death_summary$year, Death_summary$value, type="o")
title(main = "Death")
plot(Fertility_summary$year, Fertility_summary$value, type="o")
title(main = "Fertility")