#Loading all the libraries
library(tidyr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## 
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## 
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
# Read CSV into R
data1<- as.data.frame(read.csv('trade_data.csv',stringsAsFactors=FALSE))
data2<- read.csv('revenue_data.csv',stringsAsFactors=FALSE)
data3<- read.csv('poll_data.csv',stringsAsFactors=FALSE)

Cleansing steps for the trade dataset from Jason Joseph

head(data1)
##     Period        X   X2013     X.1   X2014     X.2   X2015     X.3
## 1                   Exports Imports Exports Imports Exports Imports
## 2                                                                  
## 3  January   Goods  131,228 191,745 133,738 193,706 129,292 192,242
## 4          Services   56883  37,995  59,141  38,635  59,733  40,371
## 5                                                                  
## 6 February   Goods   132084 193,731 131,768 193,060 126,329 184,370
#The data contains some empty rows, we will first remove those empty lines
data12<-filter(data1, X2013!="")
head(data12)
##     Period        X   X2013     X.1   X2014     X.2   X2015     X.3
## 1                   Exports Imports Exports Imports Exports Imports
## 2  January   Goods  131,228 191,745 133,738 193,706 129,292 192,242
## 3          Services   56883  37,995  59,141  38,635  59,733  40,371
## 4 February   Goods   132084 193,731 131,768 193,060 126,329 184,370
## 5          Services  56,828  37,935  57,726  39,270  59,560  40,057
## 6    March   Goods  130,093 186,326 135,923 198,973 127,183 198,347
#We need to fill some empty observation values in the first column
i<-seq(3,length(data12)+1,2)
j<-i-1
data12[i,1]<-data12[j,1]
data13<-data12
head(data13)
##     Period        X   X2013     X.1   X2014     X.2   X2015     X.3
## 1                   Exports Imports Exports Imports Exports Imports
## 2  January   Goods  131,228 191,745 133,738 193,706 129,292 192,242
## 3  January Services   56883  37,995  59,141  38,635  59,733  40,371
## 4 February   Goods   132084 193,731 131,768 193,060 126,329 184,370
## 5 February Services  56,828  37,935  57,726  39,270  59,560  40,057
## 6    March   Goods  130,093 186,326 135,923 198,973 127,183 198,347
#The following step aims at changing the column names, we use the two the existing column names and the first line in the dataset
cname<-names(data13)
cname
## [1] "Period" "X"      "X2013"  "X.1"    "X2014"  "X.2"    "X2015"  "X.3"
i<-seq(4,ncol(data13),2)
j<-i-1
cname[i]<-paste(cname[j],data13[1,i])
cname
## [1] "Period"        "X"             "X2013"         "X2013 Imports"
## [5] "X2014"         "X2014 Imports" "X2015"         "X2015 Imports"
i<-seq(3,ncol(data13),2)
j<-i-1
cname[i]<-paste(cname[i],data13[1,i])

#We can now remove the first line in the dataset
data13<-filter(data13, X!="")
colnames(data13)<-cname

head(data13)
##     Period        X X2013 Exports X2013 Imports X2014 Exports
## 1  January   Goods        131,228       191,745       133,738
## 2  January Services         56883        37,995        59,141
## 3 February   Goods         132084       193,731       131,768
## 4 February Services        56,828        37,935        57,726
## 5    March   Goods        130,093       186,326       135,923
## 6    March Services        56,647        37,938        58,837
##   X2014 Imports X2015 Exports X2015 Imports
## 1       193,706       129,292       192,242
## 2        38,635        59,733        40,371
## 3       193,060       126,329       184,370
## 4        39,270        59,560        40,057
## 5       198,973       127,183       198,347
## 6        38,908        59,583        40,581
data13<-gather(data13, "Type", "Revenue", 3:ncol(data13))
head(data13)
##     Period        X          Type Revenue
## 1  January   Goods  X2013 Exports 131,228
## 2  January Services X2013 Exports   56883
## 3 February   Goods  X2013 Exports  132084
## 4 February Services X2013 Exports  56,828
## 5    March   Goods  X2013 Exports 130,093
## 6    March Services X2013 Exports  56,647
data13<-data13 %>% separate(Type, c("Year", "Trade"), sep = " ", remove=TRUE)
head(data13)
##     Period        X  Year   Trade Revenue
## 1  January   Goods  X2013 Exports 131,228
## 2  January Services X2013 Exports   56883
## 3 February   Goods  X2013 Exports  132084
## 4 February Services X2013 Exports  56,828
## 5    March   Goods  X2013 Exports 130,093
## 6    March Services X2013 Exports  56,647
#We are then going to remove all the X in the year column
data13$Year<-gsub("X","",data13$Year)
data13$Revenue<-gsub("\\,","",data13$Revenue)
#We change the second column's name
colnames(data13)[2]<-"Type"
data13$Type<-gsub("\\s+","",data13$Type) 
data13<-spread(data13, Trade, Revenue)
head(data13)
##   Period     Type Year Exports Imports
## 1  April    Goods 2013  131468  189507
## 2  April    Goods 2014  135556  199877
## 3  April    Goods 2015  129376  190967
## 4  April Services 2013   56592   38318
## 5  April Services 2014   59468   39417
## 6  April Services 2015   59766   40432
#We divide the dataset in two for analysis. One dataset contains the imports and the other dataset contains the exports 
import_data<-select(data13, Period,Type,Year,Imports)
head(import_data)
##   Period     Type Year Imports
## 1  April    Goods 2013  189507
## 2  April    Goods 2014  199877
## 3  April    Goods 2015  190967
## 4  April Services 2013   38318
## 5  April Services 2014   39417
## 6  April Services 2015   40432
export_data<-select(data13,Period:Exports)
head(export_data)
##   Period     Type Year Exports
## 1  April    Goods 2013  131468
## 2  April    Goods 2014  135556
## 3  April    Goods 2015  129376
## 4  April Services 2013   56592
## 5  April Services 2014   59468
## 6  April Services 2015   59766

The analysis

#1-Calculate the average goods export for each month over the three-year span
exp_mean<-export_data %>% filter(Type == "Goods" ) %>% group_by(Period) %>% summarise(Mean_Exports = mean(as.numeric((Exports))))
exp_mean
## Source: local data frame [4 x 2]
## 
##     Period Mean_Exports
##      (chr)        (dbl)
## 1    April     132133.3
## 2 February     130060.3
## 3  January     131419.3
## 4    March     131066.3
#2- Calculate the average goods import for each month over the three-year span
imp_mean<-import_data %>% filter(Type == "Goods" ) %>% group_by(Period) %>% summarise(Mean_Imports = mean(as.numeric((Imports))))
imp_mean
## Source: local data frame [4 x 2]
## 
##     Period Mean_Imports
##      (chr)        (dbl)
## 1    April     193450.3
## 2 February     190387.0
## 3  January     192564.3
## 4    March     194548.7
#3- Calculate the average services export for each month over the three-year span
exp_mean<-export_data %>% filter(Type == "Services" ) %>% group_by(Period) %>% summarise(Mean_Exports = mean(as.numeric((Exports))))
exp_mean
## Source: local data frame [4 x 2]
## 
##     Period Mean_Exports
##      (chr)        (dbl)
## 1    April     58608.67
## 2 February     58038.00
## 3  January     58585.67
## 4    March     58355.67
#4- Calculate the average services import for each month over the three-year span
imp_mean<-import_data %>% filter(Type == "Services" ) %>% group_by(Period) %>% summarise(Mean_Imports = mean(as.numeric((Imports))))
imp_mean
## Source: local data frame [4 x 2]
## 
##     Period Mean_Imports
##      (chr)        (dbl)
## 1    April     39389.00
## 2 February     39087.33
## 3  January     39000.33
## 4    March     39142.33

Cleansing for the second dataset suggested by Nabila Hossain

head(data2)
##      X Google Facebook Yahoo Microsoft  AOL
## 1 2009   0.36     0.56  1.26      0.37 0.51
## 2 2010   0.86     1.21  1.43      0.51 0.47
## 3 2011   1.67     1.73  1.36      0.60 0.53
## 4 2012   2.26     2.18  1.35      0.90 0.70
## 5 2013   2.99     3.17  1.27      0.79 0.73
#We are using gather to create a Revenue column
data21<-gather(data2, "Company", "Revenue", 2:ncol(data2))
head(data21)
##      X  Company Revenue
## 1 2009   Google    0.36
## 2 2010   Google    0.86
## 3 2011   Google    1.67
## 4 2012   Google    2.26
## 5 2013   Google    2.99
## 6 2009 Facebook    0.56
#we arrange the revenues by year
data22<-arrange(data21, X, Company, Revenue)
head(data22)
##      X   Company Revenue
## 1 2009    Google    0.36
## 2 2009  Facebook    0.56
## 3 2009     Yahoo    1.26
## 4 2009 Microsoft    0.37
## 5 2009       AOL    0.51
## 6 2010    Google    0.86
#We change the first column's name
colnames(data22)[1]<-"Year"
data22$Company<-gsub("\\s+","",data22$Company)

Analysis

yearly <- group_by(data22, Year)
(per_year <- summarise(yearly, Total=sum(Revenue)))
## Source: local data frame [5 x 2]
## 
##    Year Total
##   (int) (dbl)
## 1  2009  3.06
## 2  2010  4.48
## 3  2011  5.89
## 4  2012  7.39
## 5  2013  8.95
data23<-arrange(data22, Company, Year, Revenue)
head(data23)
##   Year  Company Revenue
## 1 2009      AOL    0.51
## 2 2010      AOL    0.47
## 3 2011      AOL    0.53
## 4 2012      AOL    0.70
## 5 2013      AOL    0.73
## 6 2009 Facebook    0.56
#1- Total revenue by year and company 
tran <- group_by(data23, Year,Company)
(tran1 <- summarise(tran, Total=sum(Revenue)))
## Source: local data frame [25 x 3]
## Groups: Year [?]
## 
##     Year   Company Total
##    (int)     (chr) (dbl)
## 1   2009       AOL  0.51
## 2   2009  Facebook  0.56
## 3   2009    Google  0.36
## 4   2009 Microsoft  0.37
## 5   2009     Yahoo  1.26
## 6   2010       AOL  0.47
## 7   2010  Facebook  1.21
## 8   2010    Google  0.86
## 9   2010 Microsoft  0.51
## 10  2010     Yahoo  1.43
## ..   ...       ...   ...
#2- Average revenue by year and company
tran <- group_by(data23, Year, Company)
(tran1 <- summarise(tran, Avg_Revenue=mean(Revenue)))
## Source: local data frame [25 x 3]
## Groups: Year [?]
## 
##     Year   Company Avg_Revenue
##    (int)     (chr)       (dbl)
## 1   2009       AOL        0.51
## 2   2009  Facebook        0.56
## 3   2009    Google        0.36
## 4   2009 Microsoft        0.37
## 5   2009     Yahoo        1.26
## 6   2010       AOL        0.47
## 7   2010  Facebook        1.21
## 8   2010    Google        0.86
## 9   2010 Microsoft        0.51
## 10  2010     Yahoo        1.43
## ..   ...       ...         ...
#3- Compare the revenue of any two or three companies
(data24<-filter(data23, Company=="AOL"| Company == "Google"|Company=="Yahoo"))
##    Year Company Revenue
## 1  2009     AOL    0.51
## 2  2010     AOL    0.47
## 3  2011     AOL    0.53
## 4  2012     AOL    0.70
## 5  2013     AOL    0.73
## 6  2009  Google    0.36
## 7  2010  Google    0.86
## 8  2011  Google    1.67
## 9  2012  Google    2.26
## 10 2013  Google    2.99
## 11 2009   Yahoo    1.26
## 12 2010   Yahoo    1.43
## 13 2011   Yahoo    1.36
## 14 2012   Yahoo    1.35
## 15 2013   Yahoo    1.27
qplot(Year,Revenue, data=data24, color=Company)

Cleansing for the third dataset suggested by myself

head(data3)
##            Poll        Date Trump Carson Fiorina Rubio Bush Cruz Kasich
## 1       PPP (D) 10/1 - 10/4    27     17       6    13   10    7      4
## 2      IBD/TIPP 9/26 - 10/1    17     24       9    11    8    6      4
## 3  USAT/Suffolk 9/24 - 9/28    23     13      13     9    8    6      2
## 4 Pew Research* 9/22 - 9/27    25     16       8     8    4    6      1
## 5       NBC/WSJ 9/20 - 9/24    21     20      11    11    7    5      6
## 6      FOX News 9/20 - 9/22    26     18       9     9    7    8      4
##   Huckabee Christie Paul Santorum Jindal Graham Pataki
## 1        4        2    2        2      1      1      1
## 2        2        2    3        0      1      0      0
## 3        2        1    2        0      1      1      0
## 4        2        1    2        0      0      1      0
## 5        2        3    3        1      1      0      0
## 6        3        5    2        0      0      0      1
#We are creating a column called Score with gather
data31<-gather(data3, "Candidate", "Score", 3:ncol(data3))
head(data31)
##            Poll        Date Candidate Score
## 1       PPP (D) 10/1 - 10/4     Trump    27
## 2      IBD/TIPP 9/26 - 10/1     Trump    17
## 3  USAT/Suffolk 9/24 - 9/28     Trump    23
## 4 Pew Research* 9/22 - 9/27     Trump    25
## 5       NBC/WSJ 9/20 - 9/24     Trump    21
## 6      FOX News 9/20 - 9/22     Trump    26

Analysis

perf <- group_by(data31, Candidate)
(perf1 <- summarise(perf, Avg_Points=round(mean(Score),2)))
## Source: local data frame [14 x 2]
## 
##    Candidate Avg_Points
##       (fctr)      (dbl)
## 1      Trump      25.22
## 2     Carson      16.17
## 3    Fiorina       7.72
## 4      Rubio       8.11
## 5       Bush       8.61
## 6       Cruz       6.56
## 7     Kasich       3.50
## 8   Huckabee       3.61
## 9   Christie       2.50
## 10      Paul       2.78
## 11  Santorum       0.78
## 12    Jindal       0.50
## 13    Graham       0.28
## 14    Pataki       0.22
perf <- group_by(data31, Candidate, Poll)
(perf1 <- summarise(perf, Avg_Points=round(mean(Score),2)))
## Source: local data frame [182 x 3]
## Groups: Candidate [?]
## 
##    Candidate          Poll Avg_Points
##       (fctr)         (chr)      (dbl)
## 1      Trump ABC/Wash Post      33.00
## 2      Trump     Bloomberg      21.00
## 3      Trump       CBS/NYT      27.00
## 4      Trump       CNN/ORC      26.67
## 5      Trump      FOX News      25.50
## 6      Trump      IBD/TIPP      17.00
## 7      Trump      Monmouth      30.00
## 8      Trump       NBC/WSJ      21.00
## 9      Trump Pew Research*      25.00
## 10     Trump       PPP (D)      28.00
## ..       ...           ...        ...
qplot(Score,Candidate, data=data31, color=Poll)