Project 2

#Loading all the libraries
library(tidyr)
library(dplyr)

## 
## Attaching package: 'dplyr'
## 
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## 
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(ggplot2)
# Read CSV into R
data1<- as.data.frame(read.csv('trade_data.csv',stringsAsFactors=FALSE))
data2<- read.csv('revenue_data.csv',stringsAsFactors=FALSE)
data3<- read.csv('poll_data.csv',stringsAsFactors=FALSE)

Cleansing steps for the trade dataset from Jason Joseph

head(data1)

##     Period        X   X2013     X.1   X2014     X.2   X2015     X.3
## 1                   Exports Imports Exports Imports Exports Imports
## 2                                                                  
## 3  January   Goods  131,228 191,745 133,738 193,706 129,292 192,242
## 4          Services   56883  37,995  59,141  38,635  59,733  40,371
## 5                                                                  
## 6 February   Goods   132084 193,731 131,768 193,060 126,329 184,370

#The data contains some empty rows, we will first remove those empty lines
data12<-filter(data1, X2013!="")
head(data12)

##     Period        X   X2013     X.1   X2014     X.2   X2015     X.3
## 1                   Exports Imports Exports Imports Exports Imports
## 2  January   Goods  131,228 191,745 133,738 193,706 129,292 192,242
## 3          Services   56883  37,995  59,141  38,635  59,733  40,371
## 4 February   Goods   132084 193,731 131,768 193,060 126,329 184,370
## 5          Services  56,828  37,935  57,726  39,270  59,560  40,057
## 6    March   Goods  130,093 186,326 135,923 198,973 127,183 198,347

#We need to fill some empty observation values in the first column
i<-seq(3,length(data12)+1,2)
j<-i-1
data12[i,1]<-data12[j,1]
data13<-data12
head(data13)

##     Period        X   X2013     X.1   X2014     X.2   X2015     X.3
## 1                   Exports Imports Exports Imports Exports Imports
## 2  January   Goods  131,228 191,745 133,738 193,706 129,292 192,242
## 3  January Services   56883  37,995  59,141  38,635  59,733  40,371
## 4 February   Goods   132084 193,731 131,768 193,060 126,329 184,370
## 5 February Services  56,828  37,935  57,726  39,270  59,560  40,057
## 6    March   Goods  130,093 186,326 135,923 198,973 127,183 198,347

#The following step aims at changing the column names, we use the two the existing column names and the first line in the dataset
cname<-names(data13)
cname

## [1] "Period" "X"      "X2013"  "X.1"    "X2014"  "X.2"    "X2015"  "X.3"

i<-seq(4,ncol(data13),2)
j<-i-1
cname[i]<-paste(cname[j],data13[1,i])
cname

## [1] "Period"        "X"             "X2013"         "X2013 Imports"
## [5] "X2014"         "X2014 Imports" "X2015"         "X2015 Imports"

i<-seq(3,ncol(data13),2)
j<-i-1
cname[i]<-paste(cname[i],data13[1,i])

#We can now remove the first line in the dataset
data13<-filter(data13, X!="")
colnames(data13)<-cname

head(data13)

##     Period        X X2013 Exports X2013 Imports X2014 Exports
## 1  January   Goods        131,228       191,745       133,738
## 2  January Services         56883        37,995        59,141
## 3 February   Goods         132084       193,731       131,768
## 4 February Services        56,828        37,935        57,726
## 5    March   Goods        130,093       186,326       135,923
## 6    March Services        56,647        37,938        58,837
##   X2014 Imports X2015 Exports X2015 Imports
## 1       193,706       129,292       192,242
## 2        38,635        59,733        40,371
## 3       193,060       126,329       184,370
## 4        39,270        59,560        40,057
## 5       198,973       127,183       198,347
## 6        38,908        59,583        40,581

data13<-gather(data13, "Type", "Revenue", 3:ncol(data13))
head(data13)

##     Period        X          Type Revenue
## 1  January   Goods  X2013 Exports 131,228
## 2  January Services X2013 Exports   56883
## 3 February   Goods  X2013 Exports  132084
## 4 February Services X2013 Exports  56,828
## 5    March   Goods  X2013 Exports 130,093
## 6    March Services X2013 Exports  56,647

data13<-data13 %>% separate(Type, c("Year", "Trade"), sep = " ", remove=TRUE)
head(data13)

##     Period        X  Year   Trade Revenue
## 1  January   Goods  X2013 Exports 131,228
## 2  January Services X2013 Exports   56883
## 3 February   Goods  X2013 Exports  132084
## 4 February Services X2013 Exports  56,828
## 5    March   Goods  X2013 Exports 130,093
## 6    March Services X2013 Exports  56,647

#We are then going to remove all the X in the year column
data13$Year<-gsub("X","",data13$Year)
data13$Revenue<-gsub("\\,","",data13$Revenue)
#We change the second column's name
colnames(data13)[2]<-"Type"
data13$Type<-gsub("\\s+","",data13$Type) 
data13<-spread(data13, Trade, Revenue)
head(data13)

##   Period     Type Year Exports Imports
## 1  April    Goods 2013  131468  189507
## 2  April    Goods 2014  135556  199877
## 3  April    Goods 2015  129376  190967
## 4  April Services 2013   56592   38318
## 5  April Services 2014   59468   39417
## 6  April Services 2015   59766   40432

#We divide the dataset in two for analysis. One dataset contains the imports and the other dataset contains the exports 
import_data<-select(data13, Period,Type,Year,Imports)
head(import_data)

##   Period     Type Year Imports
## 1  April    Goods 2013  189507
## 2  April    Goods 2014  199877
## 3  April    Goods 2015  190967
## 4  April Services 2013   38318
## 5  April Services 2014   39417
## 6  April Services 2015   40432

export_data<-select(data13,Period:Exports)
head(export_data)

##   Period     Type Year Exports
## 1  April    Goods 2013  131468
## 2  April    Goods 2014  135556
## 3  April    Goods 2015  129376
## 4  April Services 2013   56592
## 5  April Services 2014   59468
## 6  April Services 2015   59766

The analysis

#1-Calculate the average goods export for each month over the three-year span
exp_mean<-export_data %>% filter(Type == "Goods" ) %>% group_by(Period) %>% summarise(Mean_Exports = mean(as.numeric((Exports))))
exp_mean

## Source: local data frame [4 x 2]
## 
##     Period Mean_Exports
##      (chr)        (dbl)
## 1    April     132133.3
## 2 February     130060.3
## 3  January     131419.3
## 4    March     131066.3

#2- Calculate the average goods import for each month over the three-year span
imp_mean<-import_data %>% filter(Type == "Goods" ) %>% group_by(Period) %>% summarise(Mean_Imports = mean(as.numeric((Imports))))
imp_mean

## Source: local data frame [4 x 2]
## 
##     Period Mean_Imports
##      (chr)        (dbl)
## 1    April     193450.3
## 2 February     190387.0
## 3  January     192564.3
## 4    March     194548.7

#3- Calculate the average services export for each month over the three-year span
exp_mean<-export_data %>% filter(Type == "Services" ) %>% group_by(Period) %>% summarise(Mean_Exports = mean(as.numeric((Exports))))
exp_mean

## Source: local data frame [4 x 2]
## 
##     Period Mean_Exports
##      (chr)        (dbl)
## 1    April     58608.67
## 2 February     58038.00
## 3  January     58585.67
## 4    March     58355.67

#4- Calculate the average services import for each month over the three-year span
imp_mean<-import_data %>% filter(Type == "Services" ) %>% group_by(Period) %>% summarise(Mean_Imports = mean(as.numeric((Imports))))
imp_mean

## Source: local data frame [4 x 2]
## 
##     Period Mean_Imports
##      (chr)        (dbl)
## 1    April     39389.00
## 2 February     39087.33
## 3  January     39000.33
## 4    March     39142.33

Cleansing for the second dataset suggested by Nabila Hossain

head(data2)

##      X Google Facebook Yahoo Microsoft  AOL
## 1 2009   0.36     0.56  1.26      0.37 0.51
## 2 2010   0.86     1.21  1.43      0.51 0.47
## 3 2011   1.67     1.73  1.36      0.60 0.53
## 4 2012   2.26     2.18  1.35      0.90 0.70
## 5 2013   2.99     3.17  1.27      0.79 0.73

#We are using gather to create a Revenue column
data21<-gather(data2, "Company", "Revenue", 2:ncol(data2))
head(data21)

##      X  Company Revenue
## 1 2009   Google    0.36
## 2 2010   Google    0.86
## 3 2011   Google    1.67
## 4 2012   Google    2.26
## 5 2013   Google    2.99
## 6 2009 Facebook    0.56

#we arrange the revenues by year
data22<-arrange(data21, X, Company, Revenue)
head(data22)

##      X   Company Revenue
## 1 2009    Google    0.36
## 2 2009  Facebook    0.56
## 3 2009     Yahoo    1.26
## 4 2009 Microsoft    0.37
## 5 2009       AOL    0.51
## 6 2010    Google    0.86

#We change the first column's name
colnames(data22)[1]<-"Year"
data22$Company<-gsub("\\s+","",data22$Company)

Analysis

yearly <- group_by(data22, Year)
(per_year <- summarise(yearly, Total=sum(Revenue)))

## Source: local data frame [5 x 2]
## 
##    Year Total
##   (int) (dbl)
## 1  2009  3.06
## 2  2010  4.48
## 3  2011  5.89
## 4  2012  7.39
## 5  2013  8.95

data23<-arrange(data22, Company, Year, Revenue)
head(data23)

##   Year  Company Revenue
## 1 2009      AOL    0.51
## 2 2010      AOL    0.47
## 3 2011      AOL    0.53
## 4 2012      AOL    0.70
## 5 2013      AOL    0.73
## 6 2009 Facebook    0.56

#1- Total revenue by year and company 
tran <- group_by(data23, Year,Company)
(tran1 <- summarise(tran, Total=sum(Revenue)))

## Source: local data frame [25 x 3]
## Groups: Year [?]
## 
##     Year   Company Total
##    (int)     (chr) (dbl)
## 1   2009       AOL  0.51
## 2   2009  Facebook  0.56
## 3   2009    Google  0.36
## 4   2009 Microsoft  0.37
## 5   2009     Yahoo  1.26
## 6   2010       AOL  0.47
## 7   2010  Facebook  1.21
## 8   2010    Google  0.86
## 9   2010 Microsoft  0.51
## 10  2010     Yahoo  1.43
## ..   ...       ...   ...

#2- Average revenue by year and company
tran <- group_by(data23, Year, Company)
(tran1 <- summarise(tran, Avg_Revenue=mean(Revenue)))

## Source: local data frame [25 x 3]
## Groups: Year [?]
## 
##     Year   Company Avg_Revenue
##    (int)     (chr)       (dbl)
## 1   2009       AOL        0.51
## 2   2009  Facebook        0.56
## 3   2009    Google        0.36
## 4   2009 Microsoft        0.37
## 5   2009     Yahoo        1.26
## 6   2010       AOL        0.47
## 7   2010  Facebook        1.21
## 8   2010    Google        0.86
## 9   2010 Microsoft        0.51
## 10  2010     Yahoo        1.43
## ..   ...       ...         ...

#3- Compare the revenue of any two or three companies
(data24<-filter(data23, Company=="AOL"| Company == "Google"|Company=="Yahoo"))

##    Year Company Revenue
## 1  2009     AOL    0.51
## 2  2010     AOL    0.47
## 3  2011     AOL    0.53
## 4  2012     AOL    0.70
## 5  2013     AOL    0.73
## 6  2009  Google    0.36
## 7  2010  Google    0.86
## 8  2011  Google    1.67
## 9  2012  Google    2.26
## 10 2013  Google    2.99
## 11 2009   Yahoo    1.26
## 12 2010   Yahoo    1.43
## 13 2011   Yahoo    1.36
## 14 2012   Yahoo    1.35
## 15 2013   Yahoo    1.27

qplot(Year,Revenue, data=data24, color=Company)

Cleansing for the third dataset suggested by myself

head(data3)

##            Poll        Date Trump Carson Fiorina Rubio Bush Cruz Kasich
## 1       PPP (D) 10/1 - 10/4    27     17       6    13   10    7      4
## 2      IBD/TIPP 9/26 - 10/1    17     24       9    11    8    6      4
## 3  USAT/Suffolk 9/24 - 9/28    23     13      13     9    8    6      2
## 4 Pew Research* 9/22 - 9/27    25     16       8     8    4    6      1
## 5       NBC/WSJ 9/20 - 9/24    21     20      11    11    7    5      6
## 6      FOX News 9/20 - 9/22    26     18       9     9    7    8      4
##   Huckabee Christie Paul Santorum Jindal Graham Pataki
## 1        4        2    2        2      1      1      1
## 2        2        2    3        0      1      0      0
## 3        2        1    2        0      1      1      0
## 4        2        1    2        0      0      1      0
## 5        2        3    3        1      1      0      0
## 6        3        5    2        0      0      0      1

#We are creating a column called Score with gather
data31<-gather(data3, "Candidate", "Score", 3:ncol(data3))
head(data31)

##            Poll        Date Candidate Score
## 1       PPP (D) 10/1 - 10/4     Trump    27
## 2      IBD/TIPP 9/26 - 10/1     Trump    17
## 3  USAT/Suffolk 9/24 - 9/28     Trump    23
## 4 Pew Research* 9/22 - 9/27     Trump    25
## 5       NBC/WSJ 9/20 - 9/24     Trump    21
## 6      FOX News 9/20 - 9/22     Trump    26

Analysis

perf <- group_by(data31, Candidate)
(perf1 <- summarise(perf, Avg_Points=round(mean(Score),2)))

## Source: local data frame [14 x 2]
## 
##    Candidate Avg_Points
##       (fctr)      (dbl)
## 1      Trump      25.22
## 2     Carson      16.17
## 3    Fiorina       7.72
## 4      Rubio       8.11
## 5       Bush       8.61
## 6       Cruz       6.56
## 7     Kasich       3.50
## 8   Huckabee       3.61
## 9   Christie       2.50
## 10      Paul       2.78
## 11  Santorum       0.78
## 12    Jindal       0.50
## 13    Graham       0.28
## 14    Pataki       0.22

perf <- group_by(data31, Candidate, Poll)
(perf1 <- summarise(perf, Avg_Points=round(mean(Score),2)))

## Source: local data frame [182 x 3]
## Groups: Candidate [?]
## 
##    Candidate          Poll Avg_Points
##       (fctr)         (chr)      (dbl)
## 1      Trump ABC/Wash Post      33.00
## 2      Trump     Bloomberg      21.00
## 3      Trump       CBS/NYT      27.00
## 4      Trump       CNN/ORC      26.67
## 5      Trump      FOX News      25.50
## 6      Trump      IBD/TIPP      17.00
## 7      Trump      Monmouth      30.00
## 8      Trump       NBC/WSJ      21.00
## 9      Trump Pew Research*      25.00
## 10     Trump       PPP (D)      28.00
## ..       ...           ...        ...

qplot(Score,Candidate, data=data31, color=Poll)

Project 2

Edwige

October 11, 2015

Cleansing steps for the trade dataset from Jason Joseph

The analysis

Cleansing for the second dataset suggested by Nabila Hossain

Analysis

Cleansing for the third dataset suggested by myself

Analysis