Data Load
#getwd()
raw_df <- read.csv("./Data/Unemployment_rate.csv", header = T)
head(raw_df, 2)
## By.gender By.age.group X1999..06 X1999..07 X1999..08 X1999..09
## 1 Total Total 6.7 6.7 6.2 5.2
## 2 Total 15-19 Years old 22.8 23.9 18.1 13.7
## X1999..10 X1999..11 X1999..12 X2000..01 X2000..02 X2000..03 X2000..04
## 1 5.0 4.8 5.2 5.7 5.7 5.1 4.5
## 2 13.1 13.8 19.0 19.7 15.0 15.7 14.8
## X2000..05 X2000..06 X2000..07 X2000..08 X2000..09 X2000..10 X2000..11
## 1 4.1 3.9 4.0 4.1 4.0 3.7 3.9
## 2 12.5 13.1 16.1 14.0 12.9 11.0 10.6
## X2000..12 X2001..01 X2001..02 X2001..03 X2001..04 X2001..05 X2001..06
## 1 4.4 5.1 5.5 5.1 4.1 3.7 3.6
## 2 16.7 16.3 18.9 19.5 15.3 12.4 14.4
## X2001..07 X2001..08 X2001..09 X2001..10 X2001..11 X2001..12 X2002..01
## 1 3.6 3.6 3.3 3.3 3.4 3.7 4.1
## 2 13.9 10.9 12.4 10.6 10.1 15.6 14.7
## X2002..02 X2002..03 X2002..04 X2002..05 X2002..06 X2002..07 X2002..08
## 1 4.1 3.8 3.4 3.1 2.9 3.0 3.2
## 2 13.4 13.9 12.9 9.4 10.8 12.5 10.1
## X2002..09 X2002..10 X2002..11 X2002..12 X2003..01 X2003..02 X2003..03
## 1 2.7 2.8 2.9 3.1 3.7 3.8 3.7
## 2 9.0 9.4 11.7 13.9 13.4 12.2 12.1
## X2003..04 X2003..05 X2003..06 X2003..07 X2003..08 X2003..09 X2003..10
## 1 3.5 3.4 3.4 3.6 3.5 3.3 3.4
## 2 9.9 9.5 15.2 16.0 9.7 10.5 12.8
## X2003..11 X2003..12 X2004..01 X2004..02 X2004..03 X2004..04 X2004..05
## 1 3.5 3.8 4.0 4.1 4 3.7 3.5
## 2 15.7 16.7 16.9 14.7 17 12.8 10.8
## X2004..06 X2004..07 X2004..08 X2004..09 X2004..10 X2004..11 X2004..12
## 1 3.4 3.6 3.6 3.3 3.4 3.5 3.8
## 2 18.0 15.5 10.1 8.4 10.3 11.7 19.0
## X2005..01 X2005..02 X2005..03 X2005..04 X2005..05 X2005..06 X2005..07
## 1 4.3 4.3 4.1 3.8 3.6 3.6 3.7
## 2 19.6 13.4 14.5 10.4 10.1 14.2 14.2
## X2005..08 X2005..09 X2005..10 X2005..11 X2005..12 X2006..01 X2006..02
## 1 3.6 3.6 3.6 3.3 3.5 3.8 4.1
## 2 7.6 7.4 7.6 7.7 15.1 12.4 11.2
## X2006..03 X2006..04 X2006..05 X2006..06 X2006..07 X2006..08 X2006..09
## 1 3.9 3.5 3.2 3.4 3.4 3.4 3.2
## 2 8.5 7.0 6.5 13.4 10.6 5.9 8.5
## X2006..10 X2006..11 X2006..12 X2007..01 X2007..02 X2007..03 X2007..04
## 1 3.3 3.2 3.3 3.6 3.7 3.5 3.4
## 2 13.7 11.3 15.7 9.4 10.4 9.1 8.0
## X2007..05 X2007..06 X2007..07 X2007..08 X2007..09 X2007..10 X2007..11
## 1 3.2 3.2 3.3 3.1 3.0 3.0 3.0
## 2 6.2 8.3 13.5 9.4 7.5 7.2 8.4
## X2007..12 X2008..01 X2008..02 X2008..03 X2008..04 X2008..05 X2008..06
## 1 3.1 3.3 3.5 3.4 3.2 3.1 3.1
## 2 11.6 11.2 7.3 5.9 7.8 5.7 15.8
## X2008..07 X2008..08 X2008..09 X2008..10 X2008..11 X2008..12 X2009..01
## 1 3.1 3.1 3.0 3.0 3.0 3.3 3.6
## 2 12.4 8.6 8.7 6.2 11.6 19.0 16.7
## X2009..02 X2009..03 X2009..04 X2009..05 X2009..06 X2009..07 X2009..08
## 1 3.9 4.0 3.8 3.8 3.8 3.7 3.7
## 2 11.6 12.1 9.6 7.7 19.1 15.9 9.6
## X2009..09 X2009..10 X2009..11 X2009..12 X2010..01 X2010..02 X2010..03
## 1 3.3 3.2 3.3 3.5 5.0 4.8 4.1
## 2 5.0 8.0 13.5 13.6 18.7 12.6 12.2
## X2010..04 X2010..05 X2010..06 X2010..07 X2010..08 X2010..09 X2010..10
## 1 3.7 3.2 3.5 3.7 3.3 3.4 3.3
## 2 12.3 7.9 20.3 11.6 5.7 7.1 5.9
## X2010..11 X2010..12 X2011..01 X2011..02 X2011..03 X2011..04 X2011..05
## 1 3.0 3.5 3.8 4.5 4.3 3.7 3.2
## 2 6.7 17.1 14.6 11.5 11.4 8.5 5.8
## X2011..06 X2011..07 X2011..08 X2011..09 X2011..10 X2011..11 X2011..12
## 1 3.3 3.3 3.0 3.0 2.9 2.9 3.0
## 2 12.5 16.1 7.7 8.1 7.8 7.8 12.6
## X2012..01 X2012..02 X2012..03 X2012..04 X2012..05 X2012..06 X2012..07
## 1 3.5 4.2 3.7 3.5 3.1 3.2 3.1
## 2 12.7 9.1 9.4 6.5 6.0 9.6 11.0
## X2012..08 X2012..09 X2012..10 X2012..11 X2012..12 X2013..01 X2013..02
## 1 3.0 2.9 2.8 2.8 2.9 3.4 3.9
## 2 5.8 8.8 6.7 7.8 11.3 10.2 10.0
## X2013..03 X2013..04 X2013..05 X2013..06 X2013..07 X2013..08 X2013..09
## 1 3.5 3.2 3.0 3.1 3.1 3.0 2.7
## 2 8.6 13.2 10.2 10.2 14.8 8.1 6.6
## X2013..10 X2013..11 X2013..12 X2014..01 X2014..02 X2014..03 X2014..04
## 1 2.7 2.6 3.0 3.4 4.5 3.9 3.8
## 2 6.5 7.3 15.3 16.6 10.5 7.3 8.1
## X2014..05 X2014..06 X2014..07 X2014..08 X2014..09 X2014..10 X2014..11
## 1 3.5 3.5 3.4 3.3 3.1 3.2 3.0
## 2 5.2 8.7 10.7 7.5 6.1 6.2 6.8
## X2014..12 X2015..01 X2015..02 X2015..03 X2015..04 X2015..05 X2015..06
## 1 3.3 3.7 4.5 4.0 3.9 3.7 3.8
## 2 13.5 10.7 11.6 11.1 9.0 6.7 15.3
## X2015..07 X2015..08 X2015..09 X2015..10 X2015..11 X2015..12 X2016..01
## 1 3.6 3.4 3.2 3.1 3.0 3.2 3.7
## 2 14.9 10.1 9.1 8.1 7.3 11.1 10.2
## X2016..02 X2016..03 X2016..04 X2016..05 X2016..06 X2016..07 X2016..08
## 1 4.9 4.2 3.9 3.6 3.6 3.5 3.6
## 2 13.0 14.6 11.7 10.7 9.7 8.3 8.2
## X2016..09 X2016..10 X2016..11 X2016..12 X2017..01 X2017..02 X2017..03
## 1 3.5 3.3 3.1 3.2 3.7 4.9 4.1
## 2 7.6 9.2 7.6 8.7 9.9 10.0 11.3
## X2017..04 X2017..05 X2017..06 X2017..07 X2017..08 X2017..09 X2017..10
## 1 4.2 3.6 3.8 3.4 3.6 3.3 3.2
## 2 10.0 5.8 9.2 10.3 9.8 6.3 6.3
## X2017..11 X2017..12 X2018..01 X2018..02
## 1 3.1 3.3 3.7 4.6
## 2 5.7 7.1 11.1 11.7
#names(raw_df)
Data Cleaning
library(tidyr)
library(reshape2)
#reshape: wide to long using by melt
df_m <- melt(raw_df, id.vars = c("By.gender", "By.age.group"))
head(df_m)
## By.gender By.age.group variable value
## 1 Total Total X1999..06 6.7
## 2 Total 15-19 Years old X1999..06 22.8
## 3 Total 20-29 Years old X1999..06 10.3
## 4 Total 30-39 Years old X1999..06 5.8
## 5 Total 40-49 Years old X1999..06 5.6
## 6 Total 50-59 Yeras old X1999..06 5.7
df_m$year <- substr(df_m$variable, 2, 5)
df_m$month <- substr(df_m$variable, 8, 9)
df_m$yr_mos <- paste(df_m$year, df_m$month, sep = "-")
df_m$yr_mos <- as.Date(paste(df_m$yr_mos, "-01", sep = ""))
head(df_m)
## By.gender By.age.group variable value year month yr_mos
## 1 Total Total X1999..06 6.7 1999 06 1999-06-01
## 2 Total 15-19 Years old X1999..06 22.8 1999 06 1999-06-01
## 3 Total 20-29 Years old X1999..06 10.3 1999 06 1999-06-01
## 4 Total 30-39 Years old X1999..06 5.8 1999 06 1999-06-01
## 5 Total 40-49 Years old X1999..06 5.6 1999 06 1999-06-01
## 6 Total 50-59 Yeras old X1999..06 5.7 1999 06 1999-06-01
df_select <- df_m[, c("By.gender", "By.age.group","yr_mos", "value")]
summary(df_select)
## By.gender By.age.group yr_mos
## Female:2250 15-19 Years old: 675 Min. :1999-06-01
## Male :2250 15-24 Years old: 675 1st Qu.:2004-02-01
## Total :2250 15-29 Years old: 675 Median :2008-10-01
## 15-64 Years old: 675 Mean :2008-09-30
## 20-29 Years old: 675 3rd Qu.:2013-06-01
## 30-39 Years old: 675 Max. :2018-02-01
## (Other) :2700
## value
## Min. : 0.200
## 1st Qu.: 2.600
## Median : 3.800
## Mean : 5.466
## 3rd Qu.: 8.200
## Max. :27.700
##
tail(df_select)
## By.gender By.age.group yr_mos value
## 6745 Female 40-49 Years old 2018-02-01 2.7
## 6746 Female 50-59 Yeras old 2018-02-01 2.0
## 6747 Female 60 Years old and over 2018-02-01 9.9
## 6748 Female 15-64 Years old 2018-02-01 4.1
## 6749 Female 15-24 Years old 2018-02-01 11.8
## 6750 Female 15-29 Years old 2018-02-01 9.6
#remove NA
df_cleaned <- df_select[complete.cases(df_select),]
colnames(df_cleaned) <- c("Gender", "Age", "Yr_Month", "Value")
#levels(df_cleaned$Gender)[1] : when change the item
#df_cleaned$new_age <- gsub("Years old", "Years", df_cleaned$Age)
df_10s <- df_cleaned[which(df_cleaned$Age == "15-19 Years old"),]
df_20s <- df_cleaned[which(df_cleaned$Age == "20-29 Years old"),]
df_30s <- df_cleaned[which(df_cleaned$Age == "30-39 Years old"),]
df_40s <- df_cleaned[which(df_cleaned$Age == "40-49 Years old"),]
df_50s <- df_cleaned[which(df_cleaned$Age == "50-59 Yeras old"),]
Data Visualization
library(ggplot2)
plotF <- function(df, title) {
ggplot(df, aes(x = Yr_Month, y = Value, group = Gender, color = Gender)) +
geom_point() +
geom_line() +
ggtitle(title)
}
plotF(df_10s," 15-19")

plotF(df_20s," 20-29")

plotF(df_30s," 30-39")

plotF(df_40s," 40-49")

plotF(df_50s," 50-59")
