Loading Data
# Load original education aid allocation as percentage of total aid
messy_edu_aid_per = read.csv('education_aid.csv', stringsAsFactors = FALSE, sep=',')
# Load original total aid in dollar
messy_total_aid_dollar = read.csv('total_aid.csv', stringsAsFactors = FALSE, sep=',')
# Unemployment Rate
messy_unemploy_rate = read.csv('unemployment.csv', stringsAsFactors = FALSE, sep=',')
Reshaping and Wrangle Data by Gathering Columns into Rows
# install.packages("tidyr")
library(tidyr)
library(dplyr)
##
## Attaching package: 'dplyr'
##
## The following object is masked from 'package:stats':
##
## filter
##
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
#Transform time series data
tidy_edu_aid_per =messy_edu_aid_per %>%
gather('Year', 'Aid_Per', 2:44)
tidy_total_aid_dollar = messy_total_aid_dollar %>%
gather('Year', 'Aid_Amount', 2:44)
tidy_unemploy_rate= messy_unemploy_rate %>%
gather('Year', 'Unemploy_Rate', 2:27)
# Convert Year from boolean to Integer
tidy_edu_aid_per$Year_char = as.character(tidy_edu_aid_per$Year)
tidy_edu_aid_per$Year_char = sub('X', '', tidy_edu_aid_per$Year_char)
tidy_edu_aid_per$Year_int = as.integer(tidy_edu_aid_per$Year_char)
tidy_total_aid_dollar$Year_char = as.character(tidy_total_aid_dollar$Year)
tidy_total_aid_dollar$Year_char = sub('X', '', tidy_total_aid_dollar$Year_char)
tidy_total_aid_dollar$Year_int = as.integer(tidy_total_aid_dollar$Year_char)
tidy_unemploy_rate$Year_char = as.character(tidy_unemploy_rate$Year)
tidy_unemploy_rate$Year_char = sub('X', '', tidy_unemploy_rate$Year_char)
tidy_unemploy_rate$Year_int = as.integer(tidy_unemploy_rate$Year_char)
# Update Country Column Name
colnames(tidy_edu_aid_per)[1] = c('Country')
colnames(tidy_total_aid_dollar)[1] = c('Country')
colnames(tidy_unemploy_rate)[1] = c('Country')
Create new data frame for education aid in dollar
# Define column vectors for new data frame
Country = c(tidy_edu_aid_per$Country)
Year_int = c(tidy_edu_aid_per$Year_int)
# Education Aid in Dolloar = % of Education Aid allocation x Total Aid in Dollar
Aid_Dollar = c(tidy_edu_aid_per$Aid_Per*tidy_total_aid_dollar$Aid_Amount)
edu_aid_dollar = data.frame(Country, Year_int, Aid_Dollar)
Overall trend in education aid dollar comparing between US and Japan
library(ggplot2)
df = subset(edu_aid_dollar, edu_aid_dollar$Country == 'United States'|edu_aid_dollar$Country=='Japan' | edu_aid_dollar$Country=='Netherlands')
ggplot(data = df, aes(x=Year_int, y=Aid_Dollar/1000000000, group= Country, color = Country))+
geom_line()+
scale_x_continuous(breaks = seq(1970, 2010, 5))+
scale_y_continuous(breaks = seq(0, 300, 30))+
xlab('Year')+
ylab('Education Aid in US Dollar (in Billion, valued in 2007)')+
ggtitle('Comparison of Education Aid in US, Japan, and Netherlands')
## Warning in loop_apply(n, do.ply): Removed 19 rows containing missing values
## (geom_path).

Subset educaiton aid data, join with unemployment data, and export
#subset the education aid data
df_US_edu = subset(edu_aid_dollar, edu_aid_dollar$Country == 'United States' & edu_aid_dollar$Year >= 1981)
df_JPN_edu = subset(edu_aid_dollar, edu_aid_dollar$Country == 'Japan'& edu_aid_dollar$Year >= 1981)
#add unemployment rate data to respective subsets
df_US_unemploy= subset(tidy_unemploy_rate, tidy_unemploy_rate$Country == 'United States')
df_JPN_unemploy= subset(tidy_unemploy_rate, tidy_unemploy_rate$Country == 'Japan')
#join education aid and unemployment data by Year and select the key columns for output
df_US_unemploy$Year = as.integer(df_US_unemploy$Year)
df_US = dplyr::inner_join(df_US_edu, df_US_unemploy, by = "Year_int")
df_US = dplyr::select(df_US, Country.x, Year_int, Aid_Dollar, Unemploy_Rate)
df_US$Aid_Dollar = df_US$Aid_Dollar / 1000000000
colnames(df_US)[3] = c('Aid_Dollar_in_Billion')
df_JPN_unemploy$Year = as.integer(df_JPN_unemploy$Year)
df_JPN = dplyr::inner_join(df_JPN_edu, df_JPN_unemploy, by = "Year_int")
df_JPN = dplyr::select(df_JPN, Country.x, Year_int, Aid_Dollar, Unemploy_Rate)
df_JPN$Aid_Dollar = df_JPN$Aid_Dollar / 1000000000
colnames(df_JPN)[3] = c('Aid_Dollar_in_Billion')
#write these files to csv
write.csv('US_edu_aid_unemploy.csv', x = df_US)
write.csv('JPN_edu_aid_unemploy.csv', x = df_JPN)