#Loading all the libraries
library(tidyr)
library(dplyr)
##
## Attaching package: 'dplyr'
##
## The following objects are masked from 'package:stats':
##
## filter, lag
##
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
# Read CSV into R
data1<- as.data.frame(read.csv('trade_data.csv',stringsAsFactors=FALSE))
data2<- read.csv('revenue_data.csv',stringsAsFactors=FALSE)
data3<- read.csv('poll_data.csv',stringsAsFactors=FALSE)
Cleansing steps for the trade dataset from Jason Joseph
head(data1)
## Period X X2013 X.1 X2014 X.2 X2015 X.3
## 1 Exports Imports Exports Imports Exports Imports
## 2
## 3 January Goods 131,228 191,745 133,738 193,706 129,292 192,242
## 4 Services 56883 37,995 59,141 38,635 59,733 40,371
## 5
## 6 February Goods 132084 193,731 131,768 193,060 126,329 184,370
#The data contains some empty rows, we will first remove those empty lines
data12<-filter(data1, X2013!="")
head(data12)
## Period X X2013 X.1 X2014 X.2 X2015 X.3
## 1 Exports Imports Exports Imports Exports Imports
## 2 January Goods 131,228 191,745 133,738 193,706 129,292 192,242
## 3 Services 56883 37,995 59,141 38,635 59,733 40,371
## 4 February Goods 132084 193,731 131,768 193,060 126,329 184,370
## 5 Services 56,828 37,935 57,726 39,270 59,560 40,057
## 6 March Goods 130,093 186,326 135,923 198,973 127,183 198,347
#We need to fill some empty observation values in the first column
i<-seq(3,length(data12)+1,2)
j<-i-1
data12[i,1]<-data12[j,1]
data13<-data12
head(data13)
## Period X X2013 X.1 X2014 X.2 X2015 X.3
## 1 Exports Imports Exports Imports Exports Imports
## 2 January Goods 131,228 191,745 133,738 193,706 129,292 192,242
## 3 January Services 56883 37,995 59,141 38,635 59,733 40,371
## 4 February Goods 132084 193,731 131,768 193,060 126,329 184,370
## 5 February Services 56,828 37,935 57,726 39,270 59,560 40,057
## 6 March Goods 130,093 186,326 135,923 198,973 127,183 198,347
#The following step aims at changing the column names, we use the two the existing column names and the first line in the dataset
cname<-names(data13)
cname
## [1] "Period" "X" "X2013" "X.1" "X2014" "X.2" "X2015" "X.3"
i<-seq(4,ncol(data13),2)
j<-i-1
cname[i]<-paste(cname[j],data13[1,i])
cname
## [1] "Period" "X" "X2013" "X2013 Imports"
## [5] "X2014" "X2014 Imports" "X2015" "X2015 Imports"
i<-seq(3,ncol(data13),2)
j<-i-1
cname[i]<-paste(cname[i],data13[1,i])
#We can now remove the first line in the dataset
data13<-filter(data13, X!="")
colnames(data13)<-cname
head(data13)
## Period X X2013 Exports X2013 Imports X2014 Exports
## 1 January Goods 131,228 191,745 133,738
## 2 January Services 56883 37,995 59,141
## 3 February Goods 132084 193,731 131,768
## 4 February Services 56,828 37,935 57,726
## 5 March Goods 130,093 186,326 135,923
## 6 March Services 56,647 37,938 58,837
## X2014 Imports X2015 Exports X2015 Imports
## 1 193,706 129,292 192,242
## 2 38,635 59,733 40,371
## 3 193,060 126,329 184,370
## 4 39,270 59,560 40,057
## 5 198,973 127,183 198,347
## 6 38,908 59,583 40,581
data13<-gather(data13, "Type", "Revenue", 3:ncol(data13))
head(data13)
## Period X Type Revenue
## 1 January Goods X2013 Exports 131,228
## 2 January Services X2013 Exports 56883
## 3 February Goods X2013 Exports 132084
## 4 February Services X2013 Exports 56,828
## 5 March Goods X2013 Exports 130,093
## 6 March Services X2013 Exports 56,647
data13<-data13 %>% separate(Type, c("Year", "Trade"), sep = " ", remove=TRUE)
head(data13)
## Period X Year Trade Revenue
## 1 January Goods X2013 Exports 131,228
## 2 January Services X2013 Exports 56883
## 3 February Goods X2013 Exports 132084
## 4 February Services X2013 Exports 56,828
## 5 March Goods X2013 Exports 130,093
## 6 March Services X2013 Exports 56,647
#We are then going to remove all the X in the year column
data13$Year<-gsub("X","",data13$Year)
data13$Revenue<-gsub("\\,","",data13$Revenue)
#We change the second column's name
colnames(data13)[2]<-"Type"
data13$Type<-gsub("\\s+","",data13$Type)
data13<-spread(data13, Trade, Revenue)
head(data13)
## Period Type Year Exports Imports
## 1 April Goods 2013 131468 189507
## 2 April Goods 2014 135556 199877
## 3 April Goods 2015 129376 190967
## 4 April Services 2013 56592 38318
## 5 April Services 2014 59468 39417
## 6 April Services 2015 59766 40432
#We divide the dataset in two for analysis. One dataset contains the imports and the other dataset contains the exports
import_data<-select(data13, Period,Type,Year,Imports)
head(import_data)
## Period Type Year Imports
## 1 April Goods 2013 189507
## 2 April Goods 2014 199877
## 3 April Goods 2015 190967
## 4 April Services 2013 38318
## 5 April Services 2014 39417
## 6 April Services 2015 40432
export_data<-select(data13,Period:Exports)
head(export_data)
## Period Type Year Exports
## 1 April Goods 2013 131468
## 2 April Goods 2014 135556
## 3 April Goods 2015 129376
## 4 April Services 2013 56592
## 5 April Services 2014 59468
## 6 April Services 2015 59766
The analysis
#1-Calculate the average goods export for each month over the three-year span
exp_mean<-export_data %>% filter(Type == "Goods" ) %>% group_by(Period) %>% summarise(Mean_Exports = mean(as.numeric((Exports))))
exp_mean
## Source: local data frame [4 x 2]
##
## Period Mean_Exports
## (chr) (dbl)
## 1 April 132133.3
## 2 February 130060.3
## 3 January 131419.3
## 4 March 131066.3
#2- Calculate the average goods import for each month over the three-year span
imp_mean<-import_data %>% filter(Type == "Goods" ) %>% group_by(Period) %>% summarise(Mean_Imports = mean(as.numeric((Imports))))
imp_mean
## Source: local data frame [4 x 2]
##
## Period Mean_Imports
## (chr) (dbl)
## 1 April 193450.3
## 2 February 190387.0
## 3 January 192564.3
## 4 March 194548.7
#3- Calculate the average services export for each month over the three-year span
exp_mean<-export_data %>% filter(Type == "Services" ) %>% group_by(Period) %>% summarise(Mean_Exports = mean(as.numeric((Exports))))
exp_mean
## Source: local data frame [4 x 2]
##
## Period Mean_Exports
## (chr) (dbl)
## 1 April 58608.67
## 2 February 58038.00
## 3 January 58585.67
## 4 March 58355.67
#4- Calculate the average services import for each month over the three-year span
imp_mean<-import_data %>% filter(Type == "Services" ) %>% group_by(Period) %>% summarise(Mean_Imports = mean(as.numeric((Imports))))
imp_mean
## Source: local data frame [4 x 2]
##
## Period Mean_Imports
## (chr) (dbl)
## 1 April 39389.00
## 2 February 39087.33
## 3 January 39000.33
## 4 March 39142.33
Cleansing for the second dataset suggested by Nabila Hossain
head(data2)
## X Google Facebook Yahoo Microsoft AOL
## 1 2009 0.36 0.56 1.26 0.37 0.51
## 2 2010 0.86 1.21 1.43 0.51 0.47
## 3 2011 1.67 1.73 1.36 0.60 0.53
## 4 2012 2.26 2.18 1.35 0.90 0.70
## 5 2013 2.99 3.17 1.27 0.79 0.73
#We are using gather to create a Revenue column
data21<-gather(data2, "Company", "Revenue", 2:ncol(data2))
head(data21)
## X Company Revenue
## 1 2009 Google 0.36
## 2 2010 Google 0.86
## 3 2011 Google 1.67
## 4 2012 Google 2.26
## 5 2013 Google 2.99
## 6 2009 Facebook 0.56
#we arrange the revenues by year
data22<-arrange(data21, X, Company, Revenue)
head(data22)
## X Company Revenue
## 1 2009 Google 0.36
## 2 2009 Facebook 0.56
## 3 2009 Yahoo 1.26
## 4 2009 Microsoft 0.37
## 5 2009 AOL 0.51
## 6 2010 Google 0.86
#We change the first column's name
colnames(data22)[1]<-"Year"
data22$Company<-gsub("\\s+","",data22$Company)
Analysis
yearly <- group_by(data22, Year)
(per_year <- summarise(yearly, Total=sum(Revenue)))
## Source: local data frame [5 x 2]
##
## Year Total
## (int) (dbl)
## 1 2009 3.06
## 2 2010 4.48
## 3 2011 5.89
## 4 2012 7.39
## 5 2013 8.95
data23<-arrange(data22, Company, Year, Revenue)
head(data23)
## Year Company Revenue
## 1 2009 AOL 0.51
## 2 2010 AOL 0.47
## 3 2011 AOL 0.53
## 4 2012 AOL 0.70
## 5 2013 AOL 0.73
## 6 2009 Facebook 0.56
#1- Total revenue by year and company
tran <- group_by(data23, Year,Company)
(tran1 <- summarise(tran, Total=sum(Revenue)))
## Source: local data frame [25 x 3]
## Groups: Year [?]
##
## Year Company Total
## (int) (chr) (dbl)
## 1 2009 AOL 0.51
## 2 2009 Facebook 0.56
## 3 2009 Google 0.36
## 4 2009 Microsoft 0.37
## 5 2009 Yahoo 1.26
## 6 2010 AOL 0.47
## 7 2010 Facebook 1.21
## 8 2010 Google 0.86
## 9 2010 Microsoft 0.51
## 10 2010 Yahoo 1.43
## .. ... ... ...
#2- Average revenue by year and company
tran <- group_by(data23, Year, Company)
(tran1 <- summarise(tran, Avg_Revenue=mean(Revenue)))
## Source: local data frame [25 x 3]
## Groups: Year [?]
##
## Year Company Avg_Revenue
## (int) (chr) (dbl)
## 1 2009 AOL 0.51
## 2 2009 Facebook 0.56
## 3 2009 Google 0.36
## 4 2009 Microsoft 0.37
## 5 2009 Yahoo 1.26
## 6 2010 AOL 0.47
## 7 2010 Facebook 1.21
## 8 2010 Google 0.86
## 9 2010 Microsoft 0.51
## 10 2010 Yahoo 1.43
## .. ... ... ...
#3- Compare the revenue of any two or three companies
(data24<-filter(data23, Company=="AOL"| Company == "Google"|Company=="Yahoo"))
## Year Company Revenue
## 1 2009 AOL 0.51
## 2 2010 AOL 0.47
## 3 2011 AOL 0.53
## 4 2012 AOL 0.70
## 5 2013 AOL 0.73
## 6 2009 Google 0.36
## 7 2010 Google 0.86
## 8 2011 Google 1.67
## 9 2012 Google 2.26
## 10 2013 Google 2.99
## 11 2009 Yahoo 1.26
## 12 2010 Yahoo 1.43
## 13 2011 Yahoo 1.36
## 14 2012 Yahoo 1.35
## 15 2013 Yahoo 1.27
qplot(Year,Revenue, data=data24, color=Company)

Cleansing for the third dataset suggested by myself
head(data3)
## Poll Date Trump Carson Fiorina Rubio Bush Cruz Kasich
## 1 PPP (D) 10/1 - 10/4 27 17 6 13 10 7 4
## 2 IBD/TIPP 9/26 - 10/1 17 24 9 11 8 6 4
## 3 USAT/Suffolk 9/24 - 9/28 23 13 13 9 8 6 2
## 4 Pew Research* 9/22 - 9/27 25 16 8 8 4 6 1
## 5 NBC/WSJ 9/20 - 9/24 21 20 11 11 7 5 6
## 6 FOX News 9/20 - 9/22 26 18 9 9 7 8 4
## Huckabee Christie Paul Santorum Jindal Graham Pataki
## 1 4 2 2 2 1 1 1
## 2 2 2 3 0 1 0 0
## 3 2 1 2 0 1 1 0
## 4 2 1 2 0 0 1 0
## 5 2 3 3 1 1 0 0
## 6 3 5 2 0 0 0 1
#We are creating a column called Score with gather
data31<-gather(data3, "Candidate", "Score", 3:ncol(data3))
head(data31)
## Poll Date Candidate Score
## 1 PPP (D) 10/1 - 10/4 Trump 27
## 2 IBD/TIPP 9/26 - 10/1 Trump 17
## 3 USAT/Suffolk 9/24 - 9/28 Trump 23
## 4 Pew Research* 9/22 - 9/27 Trump 25
## 5 NBC/WSJ 9/20 - 9/24 Trump 21
## 6 FOX News 9/20 - 9/22 Trump 26
Analysis
perf <- group_by(data31, Candidate)
(perf1 <- summarise(perf, Avg_Points=round(mean(Score),2)))
## Source: local data frame [14 x 2]
##
## Candidate Avg_Points
## (fctr) (dbl)
## 1 Trump 25.22
## 2 Carson 16.17
## 3 Fiorina 7.72
## 4 Rubio 8.11
## 5 Bush 8.61
## 6 Cruz 6.56
## 7 Kasich 3.50
## 8 Huckabee 3.61
## 9 Christie 2.50
## 10 Paul 2.78
## 11 Santorum 0.78
## 12 Jindal 0.50
## 13 Graham 0.28
## 14 Pataki 0.22
perf <- group_by(data31, Candidate, Poll)
(perf1 <- summarise(perf, Avg_Points=round(mean(Score),2)))
## Source: local data frame [182 x 3]
## Groups: Candidate [?]
##
## Candidate Poll Avg_Points
## (fctr) (chr) (dbl)
## 1 Trump ABC/Wash Post 33.00
## 2 Trump Bloomberg 21.00
## 3 Trump CBS/NYT 27.00
## 4 Trump CNN/ORC 26.67
## 5 Trump FOX News 25.50
## 6 Trump IBD/TIPP 17.00
## 7 Trump Monmouth 30.00
## 8 Trump NBC/WSJ 21.00
## 9 Trump Pew Research* 25.00
## 10 Trump PPP (D) 28.00
## .. ... ... ...
qplot(Score,Candidate, data=data31, color=Poll)
