Loading Data

# Load original education aid allocation as percentage of total aid
messy_edu_aid_per = read.csv('education_aid.csv', stringsAsFactors = FALSE, sep=',')

# Load original total aid in dollar
messy_total_aid_dollar = read.csv('total_aid.csv', stringsAsFactors = FALSE, sep=',')

# Unemployment Rate
messy_unemploy_rate = read.csv('unemployment.csv', stringsAsFactors = FALSE, sep=',')

Reshaping and Wrangle Data by Gathering Columns into Rows

# install.packages("tidyr")
library(tidyr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## 
## The following object is masked from 'package:stats':
## 
##     filter
## 
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
#Transform time series data
tidy_edu_aid_per =messy_edu_aid_per %>% 
  gather('Year', 'Aid_Per', 2:44)

tidy_total_aid_dollar = messy_total_aid_dollar %>%
  gather('Year', 'Aid_Amount', 2:44)

tidy_unemploy_rate= messy_unemploy_rate %>%
  gather('Year', 'Unemploy_Rate', 2:27)

# Convert Year from boolean to Integer
tidy_edu_aid_per$Year_char = as.character(tidy_edu_aid_per$Year)
tidy_edu_aid_per$Year_char = sub('X', '', tidy_edu_aid_per$Year_char)
tidy_edu_aid_per$Year_int = as.integer(tidy_edu_aid_per$Year_char)

tidy_total_aid_dollar$Year_char = as.character(tidy_total_aid_dollar$Year)
tidy_total_aid_dollar$Year_char = sub('X', '', tidy_total_aid_dollar$Year_char)
tidy_total_aid_dollar$Year_int = as.integer(tidy_total_aid_dollar$Year_char)

tidy_unemploy_rate$Year_char = as.character(tidy_unemploy_rate$Year)
tidy_unemploy_rate$Year_char = sub('X', '', tidy_unemploy_rate$Year_char)
tidy_unemploy_rate$Year_int = as.integer(tidy_unemploy_rate$Year_char)

# Update Country Column Name
colnames(tidy_edu_aid_per)[1] = c('Country')
colnames(tidy_total_aid_dollar)[1] = c('Country')
colnames(tidy_unemploy_rate)[1] = c('Country')

Create new data frame for education aid in dollar

# Define column vectors for new data frame
Country = c(tidy_edu_aid_per$Country)
Year_int = c(tidy_edu_aid_per$Year_int)

# Education Aid in Dolloar = % of Education Aid allocation x Total Aid in Dollar
Aid_Dollar = c(tidy_edu_aid_per$Aid_Per*tidy_total_aid_dollar$Aid_Amount)

edu_aid_dollar = data.frame(Country, Year_int, Aid_Dollar)

Overall trend in education aid dollar comparing between US and Japan

library(ggplot2)

df = subset(edu_aid_dollar, edu_aid_dollar$Country == 'United States'|edu_aid_dollar$Country=='Japan' | edu_aid_dollar$Country=='Netherlands')

ggplot(data = df, aes(x=Year_int, y=Aid_Dollar/1000000000, group= Country, color = Country))+
  geom_line()+
  scale_x_continuous(breaks = seq(1970, 2010, 5))+
  scale_y_continuous(breaks = seq(0, 300, 30))+
  xlab('Year')+
  ylab('Education Aid in US Dollar (in Billion, valued in 2007)')+
  ggtitle('Comparison of Education Aid in US, Japan, and Netherlands')
## Warning in loop_apply(n, do.ply): Removed 19 rows containing missing values
## (geom_path).

Subset educaiton aid data, join with unemployment data, and export

#subset the education aid data
df_US_edu = subset(edu_aid_dollar, edu_aid_dollar$Country == 'United States' & edu_aid_dollar$Year >= 1981)
df_JPN_edu = subset(edu_aid_dollar, edu_aid_dollar$Country == 'Japan'& edu_aid_dollar$Year >= 1981)

#add unemployment rate data to respective subsets
df_US_unemploy= subset(tidy_unemploy_rate, tidy_unemploy_rate$Country == 'United States')
df_JPN_unemploy= subset(tidy_unemploy_rate, tidy_unemploy_rate$Country == 'Japan')

#join education aid and unemployment data by Year and select the key columns for output
df_US_unemploy$Year = as.integer(df_US_unemploy$Year)
df_US = dplyr::inner_join(df_US_edu, df_US_unemploy, by = "Year_int")
df_US = dplyr::select(df_US, Country.x, Year_int, Aid_Dollar, Unemploy_Rate)
df_US$Aid_Dollar = df_US$Aid_Dollar / 1000000000
colnames(df_US)[3] = c('Aid_Dollar_in_Billion')

df_JPN_unemploy$Year = as.integer(df_JPN_unemploy$Year)
df_JPN = dplyr::inner_join(df_JPN_edu, df_JPN_unemploy, by = "Year_int")
df_JPN = dplyr::select(df_JPN, Country.x, Year_int, Aid_Dollar, Unemploy_Rate)
df_JPN$Aid_Dollar = df_JPN$Aid_Dollar / 1000000000
colnames(df_JPN)[3] = c('Aid_Dollar_in_Billion')

#write these files to csv 
write.csv('US_edu_aid_unemploy.csv', x = df_US)
write.csv('JPN_edu_aid_unemploy.csv', x = df_JPN)