This page demonstrates three different ways of tidying up the data.
1. Natural way - Using Tidyr package
2. Fast(in terms of runtime) way - Using data.table
3. Hard way - Without using any built-in function/package
Most of the operations are explained as code comments within the code chunck.
I choose Jason Joseph’s problem as that was a good wide data candidate for Tidyr package.
Jason has extracted the data from http://www.census.gov/foreign-trade/data/index.html and provided a nice sample file. Thanks Jason for the csv.
library(tidyr)
library(dplyr)
library(knitr)
trade_wide=read.table(file="https://raw.githubusercontent.com/mkds/MSDA/master/IS607/data/International Trade in Goods and Services.csv",stringsAsFactors = F,header = T,na.strings = "",sep = ",")
print(trade_wide)
## Period X X2013 X.1 X2014 X.2 X2015 X.3
## 1 <NA> <NA> Exports Imports Exports Imports Exports Imports
## 2 <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>
## 3 January Goods 131,228 191,745 133,738 193,706 129,292 192,242
## 4 <NA> Services 56883 37,995 59,141 38,635 59,733 40,371
## 5 <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>
## 6 February Goods 132084 193,731 131,768 193,060 126,329 184,370
## 7 <NA> Services 56,828 37,935 57,726 39,270 59,560 40,057
## 8 <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>
## 9 March Goods 130,093 186,326 135,923 198,973 127,183 198,347
## 10 <NA> Services 56,647 37,938 58,837 38,908 59,583 40,581
## 11 <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>
## 12 April Goods 131,468 189,507 135,556 199,877 129,376 190,967
## 13 <NA> Services 56,592 38,318 59,468 39,417 59,766 40,432
#Remove Rows which has all the values as NA
trade_wide=trade_wide[rowSums(is.na(trade_wide))!=8,]
#Rename the columns in line with the values of the column
colnames(trade_wide)=c("Month","Type","Exports 2013","Imports 2013","Exports 2014","Imports 2014","Exports 2015","Imports 2015")
#Now let's use tidyr and dplyr to transform the data
trade=trade_wide %>%
gather(cat,value,3:8) %>% #Collapse column 3 to 8 into two column (key and value pair)
separate(cat,c("Trade","year")) %>% #Split the Trade type and year which is stored in one column to two separate columns
fill(1) %>% #Fill missing values with previous row values
filter(!is.na(Type)) %>% #Remove rows with type NA
mutate(value=as.numeric(gsub(",","",value))) #Remove thousand separator in numbers
head(trade)
## Month Type Trade year value
## 1 January Goods Exports 2013 131228
## 2 January Services Exports 2013 56883
## 3 February Goods Exports 2013 132084
## 4 February Services Exports 2013 56828
## 5 March Goods Exports 2013 130093
## 6 March Services Exports 2013 56647
#Now that we got the data in desired format, let's answer the question average trade by month and type
kable(trade %>% group_by(Month,Type) %>% summarise(Avg=mean(value)))
| Month | Type | Avg |
|---|---|---|
| April | Goods | 162791.83 |
| April | Services | 48998.83 |
| February | Goods | 160223.67 |
| February | Services | 48562.67 |
| January | Goods | 161991.83 |
| January | Services | 48793.00 |
| March | Goods | 162807.50 |
| March | Services | 48749.00 |
For this approach I choose to work with the data provided by Christina Taylor. This data require some transformation and join and so I wanted to try that out in data.table. In general data.table operations tend to execute much faster than tidyr.
library(data.table)
library(ggplot2)
jobopening=fread("https://raw.githubusercontent.com/mkds/MSDA/master/IS607/data/JobOpen.csv") #Note: fread is faster than read.csv and does a better job in detecting separator and column classes
head(jobopening)
## Opening ID Role Location Recruiter Target Open Date
## 1: 1 Administration US Bilbo 11/20/2014
## 2: 2 Administration US Jerry 11/18/2014
## 3: 3 Administration US Gandalf 11/30/2014
## 4: 4 Administration US Thorin Oakenshield 10/4/2014
## 5: 5 Administration US Shamus 10/13/2014
## 6: 6 Administration US Balin 12/2/2014
## Target Start Date Actual Open Date Offer Accepted Date
## 1: 2/18/2015 11/23/2014 12/25/2014
## 2: 2/16/2015 11/23/2014 12/14/2014
## 3: 2/28/2015 12/6/2014 2/10/2015
## 4: 1/2/2015 10/4/2014 12/23/2014
## 5: 1/11/2015 10/22/2014 11/19/2014
## 6: 3/2/2015 12/8/2014 2/12/2015
## Actual Start Date employee ID of hired guy
## 1: 1/7/2015 1
## 2: 12/21/2014 2
## 3: 2/20/2015 3
## 4: 12/30/2014 4
## 5: 11/28/2014 5
## 6: 2/22/2015 6
jobopening[,`Actual Start Date`:=as.Date(`Actual Start Date`,"%m/%d/%Y")] #Convert date string to Date type
#Compute Quater and year of actual start Date
jobopening[,`:=`(Qtr=paste0("Q",(as.numeric(format(`Actual Start Date`,"%m"))-1)%/%3+1),Year=format(`Actual Start Date`,"%Y"))]
head(jobopening)
## Opening ID Role Location Recruiter Target Open Date
## 1: 1 Administration US Bilbo 11/20/2014
## 2: 2 Administration US Jerry 11/18/2014
## 3: 3 Administration US Gandalf 11/30/2014
## 4: 4 Administration US Thorin Oakenshield 10/4/2014
## 5: 5 Administration US Shamus 10/13/2014
## 6: 6 Administration US Balin 12/2/2014
## Target Start Date Actual Open Date Offer Accepted Date
## 1: 2/18/2015 11/23/2014 12/25/2014
## 2: 2/16/2015 11/23/2014 12/14/2014
## 3: 2/28/2015 12/6/2014 2/10/2015
## 4: 1/2/2015 10/4/2014 12/23/2014
## 5: 1/11/2015 10/22/2014 11/19/2014
## 6: 3/2/2015 12/8/2014 2/12/2015
## Actual Start Date employee ID of hired guy Qtr Year
## 1: 2015-01-07 1 Q1 2015
## 2: 2014-12-21 2 Q4 2014
## 3: 2015-02-20 3 Q1 2015
## 4: 2014-12-30 4 Q4 2014
## 5: 2014-11-28 5 Q4 2014
## 6: 2015-02-22 6 Q1 2015
hctarget=fread("https://raw.githubusercontent.com/mkds/MSDA/master/IS607/data/hctarget.csv")[-.N]
head(hctarget)
## Role Location Current_HeadCount Q1_2015 Q2_2015 Q3_2015
## 1: Administration US 33 39 47 55
## 2: Administration DZ 33 39 47 55
## 3: Administration GF 34 40 48 57
## 4: Sales US 66 78 93 111
## 5: Sales DZ 66 78 93 111
## 6: Sales GF 67 80 95 113
## Q4_2015
## 1: 66
## 2: 66
## 3: 68
## 4: 132
## 5: 132
## 6: 134
#Collapse Quaterly columns to Qtr/Value column
hctarget=melt(hctarget,measure.vars = c("Q1_2015","Q2_2015","Q3_2015","Q4_2015"),variable.name="Qtr",value.name="Target")
#Split Qtr column to Qtr/Year
hctarget[,c("Qtr","Year"):=tstrsplit(Qtr,"_")]
head(hctarget)
## Role Location Current_HeadCount Qtr Target Year
## 1: Administration US 33 Q1 39 2015
## 2: Administration DZ 33 Q1 39 2015
## 3: Administration GF 34 Q1 40 2015
## 4: Sales US 66 Q1 78 2015
## 5: Sales DZ 66 Q1 78 2015
## 6: Sales GF 67 Q1 80 2015
#Set Key values so that we can do data table joins
setkey(jobopening,Role,Location,Qtr,Year)
setkey(hctarget,Role,Location,Qtr,Year)
#Compute total hires and extract only required column
hired=jobopening[!is.na(Year),.(hires=.N),by=.(Role,Location,Qtr,Year)]
#Join hired data and target data
results=hired[hctarget]
#Convert NAs to zero
results$hires[is.na(results$hires)]=0
head(results)
## Role Location Qtr Year hires Current_HeadCount Target
## 1: Administration DZ Q1 2015 6 33 39
## 2: Administration DZ Q2 2015 3 33 47
## 3: Administration DZ Q3 2015 0 33 55
## 4: Administration DZ Q4 2015 0 33 66
## 5: Administration GF Q1 2015 7 34 40
## 6: Administration GF Q2 2015 4 34 48
#Heat Map of (Actual Vs Target) by year and Quarter using ggplot facet
ggplot(results,aes(Location, Role)) + geom_tile(aes(fill=hires/(Target-Current_HeadCount)*100)) + scale_fill_gradient2(low="white",high="green") + facet_grid(~Year+Qtr) + labs(fill="Achieved%") + theme(axis.text.x=element_text(size=12),axis.text.y=element_text(size=12))
What if the data that we get is so messy and we couldn’t use any package? I wanted to try and tidy up data without using any package and see how it goes. I choose the data posted by Maxwell Wagner Th data is sourced from https://en.wikipedia.org/wiki/Motocross_des_Nations.
For this problem I came up with solution assuming that tidyr and ddplyr are not available and I have to do the tidy part myself.
I defined a function that pretty much performs the tidyr::gather function
my_gather=function(x,cols,keyname,valuename){
all_keyvalue=NULL
for (y in cols){c
varname=colnames(x)[y]
keyvalue=cbind(varname,x[,y,with=FALSE],x[,.I])
colnames(keyvalue)=c(keyname,valuename,"row")#The extra row column is to keep track of the row to which this column needs to be attached to
all_keyvalue=rbind(all_keyvalue,keyvalue)
}
all_keyvalue
}
Then, I performed the data cleansing and tidying as shown below
moto_wide=fread("https://raw.githubusercontent.com/mkds/MSDA/master/IS607/data/motocross.csv",stringsAsFactors = F)
head(moto_wide)
## Team \n2000 \n2001 \n2002 \n2003 \n2004 \n2005 \n2006
## 1: Australia 7th 6th 14th 13th 16th 11th
## 2: Austria 13th 22nd 23rd 21st
## 3: Belarus 27th
## 4: Belgium 3rd 2nd 2nd 1st 1st 3rd 2nd
## 5: Brazil 23rd 23rd
## 6: Canada 28th 26th 8th 17th 18th
## \n2007 \n2008 \n2009 \n2010 \n2011 \n2012 \n2013 \n2014 \n2015
## 1: 20th 6th 7th 6th 3rd 10th 4th 14th 7th
## 2: 23rd 12th 20th 24th 12th 10th
## 3:
## 4: 3rd 3rd 3rd 2nd 5th 2nd 1st 2nd 3rd
## 5: 16th 14th 14th 18th 27th 32nd 31st 27th 27th
## 6: 10th 13th 27th 16th 23rd 17th
keyvalues=my_gather(moto_wide,2:17,"year","position")#Collapse columns to key value pair
head(keyvalues)
## year position row
## 1: \n2000 7th 1
## 2: \n2000 13th 2
## 3: \n2000 3
## 4: \n2000 3rd 4
## 5: \n2000 23rd 5
## 6: \n2000 28th 6
#Extract Numbers from the strings
keyvalues[,`:=`(year=as.numeric(gsub("\n","",year)),position=as.numeric(gsub("(\\d+).*","\\1",position)))]
#Add row number as column so that we can join by row
moto_wide[,row:=.I]
setkey(moto_wide,row)
setkey(keyvalues,row)
#Join keyvalues with the required values from original data
moto=keyvalues[moto_wide[,.(Team,row)]][,.(Team,year,position)]
head(moto)
## Team year position
## 1: Australia 2000 7
## 2: Australia 2001 6
## 3: Australia 2002 NA
## 4: Australia 2003 14
## 5: Australia 2004 13
## 6: Australia 2005 16
#Note that all above function could be performed easily using tidyr as given below
# moto=moto_wide %>% gather(year,position,2:17) %>%
# mutate(year=as.numeric(gsub("X..","",year)),position=as.numeric(gsub("(\\d+).*","\\1",position)))
#
#Now that we got the long data let's try to answer the question of average finishes and average standing
kable(moto %>%
group_by(Team) %>%
summarize(finishes=sum(!is.na(position)),Avg_position=mean(position,na.rm=T)) %>%
arrange(Avg_position))
| Team | finishes | Avg_position |
|---|---|---|
| USA | 13 | 1.538461 |
| Belgium | 16 | 2.375000 |
| France | 16 | 4.500000 |
| Great Britain | 15 | 6.133333 |
| Italy | 16 | 8.312500 |
| Germany | 16 | 8.937500 |
| Australia | 15 | 9.600000 |
| Estonia | 16 | 9.687500 |
| South Africa | 11 | 10.454546 |
| New Zealand | 14 | 11.571429 |
| Spain | 16 | 12.625000 |
| Finland | 15 | 12.866667 |
| Switzerland | 15 | 13.133333 |
| Netherlands | 13 | 13.461538 |
| Portugal | 16 | 13.875000 |
| Japan | 13 | 15.384615 |
| Czech Republic | 15 | 16.600000 |
| Ireland | 16 | 16.750000 |
| Sweden | 16 | 16.750000 |
| Denmark | 14 | 16.928571 |
| Latvia | 16 | 17.875000 |
| Austria | 10 | 18.000000 |
| Chile | 1 | 18.000000 |
| Russia | 13 | 18.153846 |
| Canada | 11 | 18.454545 |
| Slovenia | 14 | 21.071429 |
| Ecuador | 2 | 21.500000 |
| Norway | 11 | 21.727273 |
| Puerto Rico | 9 | 22.444444 |
| Brazil | 11 | 22.909091 |
| Costa Rica | 2 | 25.000000 |
| Guatemala | 4 | 25.250000 |
| Poland | 3 | 25.666667 |
| Morocco | 2 | 26.000000 |
| Slovakia | 8 | 26.000000 |
| Venezuela | 10 | 26.200000 |
| Belarus | 1 | 27.000000 |
| Mexico | 2 | 27.000000 |
| Hungary | 8 | 27.750000 |
| Croatia | 10 | 27.800000 |
| Lithuania | 11 | 27.818182 |
| Ukraine | 8 | 27.875000 |
| Philippines | 1 | 29.000000 |
| Dominican Republic | 1 | 30.000000 |
| Greece | 10 | 30.500000 |
| Iceland | 7 | 31.142857 |
| Romania | 1 | 32.000000 |
| China | 2 | 32.500000 |
| Cyprus | 2 | 32.500000 |
| Mongolia | 2 | 32.500000 |
| Luxembourg | 4 | 32.750000 |
| Thailand | 4 | 35.000000 |
| Kuwait | 1 | 36.000000 |
| Israel | 3 | 36.666667 |