Project 2. Read 3 datasets and transform them from wide to long format and perform analysis.

library (curl)
## Warning: package 'curl' was built under R version 3.2.2
library (ggplot2)
## Warning: package 'ggplot2' was built under R version 3.2.2
library(tidyr)
## Warning: package 'tidyr' was built under R version 3.2.2
library(dplyr)
## 
## Attaching package: 'dplyr'
## 
## The following object is masked from 'package:stats':
## 
##     filter
## 
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
setwd("C://users//angus//Documents//IS607//Project2")

read in first dataset

hr<- read.csv("C://users//angus//Documents//IS607//Project2//hr.csv")

remove the extra subtotal

hr2<-subset(hr,Role!='ALL')
hr2
##              Role Location Current_HeadCount Q1_2015 Q2_2015 Q3_2015
## 1  Administration       US                33      39      47      55
## 2  Administration       DZ                33      39      47      55
## 3  Administration       GF                34      40      48      57
## 4           Sales       US                66      78      93     111
## 5           Sales       DZ                66      78      93     111
## 6           Sales       GF                67      80      95     113
## 7         Product       US                66      78      93     111
## 8         Product       DZ                66      78      93     111
## 9         Product       GF                67      80      95     113
## 10    Engineering       US               100     119     141     168
## 11    Engineering       DZ               100     119     141     168
## 12    Engineering       GF               100     119     141     168
## 13       Services       US                17      20      24      29
## 14       Services       DZ                17      20      24      29
## 15       Services       GF                18      21      25      30
## 16     Operations       US                50      59      71      84
## 17     Operations       DZ                50      59      71      84
## 18     Operations       GF                50      59      71      84
##    Q4_2015
## 1       66
## 2       66
## 3       68
## 4      132
## 5      132
## 6      134
## 7      132
## 8      132
## 9      134
## 10     200
## 11     200
## 12     200
## 13      34
## 14      34
## 15      36
## 16     100
## 17     100
## 18     100

Read in second data set

grade<-read.csv("C://users//angus//Documents//IS607//Project2//schoolgrade.csv")
grade
##           Student English Math Science Social.Studies Technology
## 1     Alicia Cruz      46   47      94             96         84
## 2   Solomon Tunde      64   84      43             60         83
## 3   Daniel Zainea      47   82      52             58         96
## 4    Thomas Brown      96   50      58             71        100
## 5   Lisa Giamanco      40   76      50             85         78
## 6  Cheryl Haridat      75   78      98             41         88
## 7   Winston Cadiz      90   73      90             75         45
## 8    Pablo Romero      91   50      85             68         72
## 9   Jennifer WeiI      98   63      43             58         40
## 10 Ronald Rosario      98   40      98             75         81
## 11  Martha carpio      69   60      52             60         47
## 12 Jasmine carpio      99   86      49             89         93
## 13    Raul Martin      42   59      54             48         80
## 14    Desio Trump      41   42      69             92         55
##    Days.Present
## 1           169
## 2           121
## 3           146
## 4            95
## 5           118
## 6           159
## 7           175
## 8           101
## 9           107
## 10           94
## 11          171
## 12          140
## 13          174
## 14          167
grade2 <-grade

Correct the names of the vairables

names(grade2)[5] <- "Social_Studies"
names(grade2)[7] <- "Days_Present"
grade2
##           Student English Math Science Social_Studies Technology
## 1     Alicia Cruz      46   47      94             96         84
## 2   Solomon Tunde      64   84      43             60         83
## 3   Daniel Zainea      47   82      52             58         96
## 4    Thomas Brown      96   50      58             71        100
## 5   Lisa Giamanco      40   76      50             85         78
## 6  Cheryl Haridat      75   78      98             41         88
## 7   Winston Cadiz      90   73      90             75         45
## 8    Pablo Romero      91   50      85             68         72
## 9   Jennifer WeiI      98   63      43             58         40
## 10 Ronald Rosario      98   40      98             75         81
## 11  Martha carpio      69   60      52             60         47
## 12 Jasmine carpio      99   86      49             89         93
## 13    Raul Martin      42   59      54             48         80
## 14    Desio Trump      41   42      69             92         55
##    Days_Present
## 1           169
## 2           121
## 3           146
## 4            95
## 5           118
## 6           159
## 7           175
## 8           101
## 9           107
## 10           94
## 11          171
## 12          140
## 13          174
## 14          167

read the third long data set

trade <- read.csv("C://users//angus//Documents//IS607//Project2//trade.csv")
trade2<- trade

remove commas from numeric values and correct variables names

trade2[,] <-lapply(trade2[,],function(trade2){gsub(",","",trade2)} )
trade2
##      Period        X   X2013     X.1   X2014     X.2   X2015     X.3
## 1                    Exports Imports Exports Imports Exports Imports
## 2                                                                   
## 3   January   Goods   131228  191745  133738  193706  129292  192242
## 4           Services   56883   37995   59141   38635   59733   40371
## 5                                                                   
## 6  February   Goods   132084  193731  131768  193060  126329  184370
## 7           Services   56828   37935   57726   39270   59560   40057
## 8                                                                   
## 9     March   Goods   130093  186326  135923  198973  127183  198347
## 10          Services   56647   37938   58837   38908   59583   40581
## 11                                                                  
## 12    April   Goods   131468  189507  135556  199877  129376  190967
## 13          Services   56592   38318   59468   39417   59766   40432
names(trade2)[2] <- "Type"
names(trade2)[3:8] <- c("2013:e","2013:i","2014:e","2014:i","2015:e","2015:i")

Convert wide from to long form and filled in the blank with correct data type

trade3 <-trade2[2:13,]
trade3 <-trade3[complete.cases(trade3),]
#remove blank rows
trade3 <- trade3[!(is.na(trade3$`2015:i`) |trade3$`2015:i` ==""),]
trade3[2,1] <- "January"
trade3[4,1] <- "February"
trade3[6,1] <- "March"
trade3[8,1] <- "April"
head(trade3)
##      Period     Type 2013:e 2013:i 2014:e 2014:i 2015:e 2015:i
## 3   January   Goods  131228 191745 133738 193706 129292 192242
## 4   January Services  56883  37995  59141  38635  59733  40371
## 6  February   Goods  132084 193731 131768 193060 126329 184370
## 7  February Services  56828  37935  57726  39270  59560  40057
## 9     March   Goods  130093 186326 135923 198973 127183 198347
## 10    March Services  56647  37938  58837  38908  59583  40581

gather data into tabular format, split the column data into separate column

trade4 <- trade3 %>% gather (year,amount,3:8)
head(trade4)
##     Period     Type   year amount
## 1  January   Goods  2013:e 131228
## 2  January Services 2013:e  56883
## 3 February   Goods  2013:e 132084
## 4 February Services 2013:e  56828
## 5    March   Goods  2013:e 130093
## 6    March Services 2013:e  56647
trade4$tradecat<-sub(".*:","",trade4$year)
trade4$year <-sub(":.*","",trade4$year)
head(trade4)
##     Period     Type year amount tradecat
## 1  January   Goods  2013 131228        e
## 2  January Services 2013  56883        e
## 3 February   Goods  2013 132084        e
## 4 February Services 2013  56828        e
## 5    March   Goods  2013 130093        e
## 6    March Services 2013  56647        e

now spread the data & perform analysis.

Conclusion: The final table shows that American companies have strong export in service industry.

trade5 <-trade4%>% spread(tradecat,amount)
head(trade5)
##   Period     Type year      e      i
## 1  April   Goods  2013 131468 189507
## 2  April   Goods  2014 135556 199877
## 3  April   Goods  2015 129376 190967
## 4  April Services 2013  56592  38318
## 5  April Services 2014  59468  39417
## 6  April Services 2015  59766  40432
trade5 <-trade5[order(trade5$Type),]
tbl_df(trade5)
## Source: local data frame [24 x 5]
## 
##      Period   Type year      e      i
## 1     April Goods  2013 131468 189507
## 2     April Goods  2014 135556 199877
## 3     April Goods  2015 129376 190967
## 4  February Goods  2013 132084 193731
## 5  February Goods  2014 131768 193060
## 6  February Goods  2015 126329 184370
## 7   January Goods  2013 131228 191745
## 8   January Goods  2014 133738 193706
## 9   January Goods  2015 129292 192242
## 10    March Goods  2013 130093 186326
## ..      ...    ...  ...    ...    ...
trade5$e <- as.numeric(trade5$e)
trade5$i <- as.numeric(trade5$i)
trade5$exportratio <-(trade5$e/trade5$i)
trade5$exportratio <-(trade5$e/trade5$i)*100
names(trade5)[4:5]<-c("export","import")
trade5
##      Period     Type year export import exportratio
## 1     April   Goods  2013 131468 189507    69.37369
## 2     April   Goods  2014 135556 199877    67.81971
## 3     April   Goods  2015 129376 190967    67.74783
## 7  February   Goods  2013 132084 193731    68.17907
## 8  February   Goods  2014 131768 193060    68.25236
## 9  February   Goods  2015 126329 184370    68.51928
## 13  January   Goods  2013 131228 191745    68.43881
## 14  January   Goods  2014 133738 193706    69.04174
## 15  January   Goods  2015 129292 192242    67.25481
## 19    March   Goods  2013 130093 186326    69.82010
## 20    March   Goods  2014 135923 198973    68.31228
## 21    March   Goods  2015 127183 198347    64.12146
## 4     April Services 2013  56592  38318   147.69038
## 5     April Services 2014  59468  39417   150.86891
## 6     April Services 2015  59766  40432   147.81856
## 10 February Services 2013  56828  37935   149.80361
## 11 February Services 2014  57726  39270   146.99771
## 12 February Services 2015  59560  40057   148.68812
## 16  January Services 2013  56883  37995   149.71180
## 17  January Services 2014  59141  38635   153.07623
## 18  January Services 2015  59733  40371   147.96017
## 22    March Services 2013  56647  37938   149.31467
## 23    March Services 2014  58837  38908   151.22083
## 24    March Services 2015  59583  40581   146.82487

perform analysis on second dataset

library(ggplot2)
hr2
##              Role Location Current_HeadCount Q1_2015 Q2_2015 Q3_2015
## 1  Administration       US                33      39      47      55
## 2  Administration       DZ                33      39      47      55
## 3  Administration       GF                34      40      48      57
## 4           Sales       US                66      78      93     111
## 5           Sales       DZ                66      78      93     111
## 6           Sales       GF                67      80      95     113
## 7         Product       US                66      78      93     111
## 8         Product       DZ                66      78      93     111
## 9         Product       GF                67      80      95     113
## 10    Engineering       US               100     119     141     168
## 11    Engineering       DZ               100     119     141     168
## 12    Engineering       GF               100     119     141     168
## 13       Services       US                17      20      24      29
## 14       Services       DZ                17      20      24      29
## 15       Services       GF                18      21      25      30
## 16     Operations       US                50      59      71      84
## 17     Operations       DZ                50      59      71      84
## 18     Operations       GF                50      59      71      84
##    Q4_2015
## 1       66
## 2       66
## 3       68
## 4      132
## 5      132
## 6      134
## 7      132
## 8      132
## 9      134
## 10     200
## 11     200
## 12     200
## 13      34
## 14      34
## 15      36
## 16     100
## 17     100
## 18     100
hr3<-hr2[,1:3]
hr4<-hr3 %>%spread(Location,Current_HeadCount)
hr4$total <- hr4$DZ+hr4$GF+hr4$US
hr4
##             Role  DZ  GF  US total
## 1 Administration  33  34  33   100
## 2    Engineering 100 100 100   300
## 3     Operations  50  50  50   150
## 4        Product  66  67  66   199
## 5          Sales  66  67  66   199
## 6       Services  17  18  17    52
ggplot(data=hr4, aes(x=hr4$Role, y=hr4$total)) + geom_bar(stat="identity", position = "dodge")

perform analysis on third dataset and perform correlation test between days present and average grade scores

grade2
##           Student English Math Science Social_Studies Technology
## 1     Alicia Cruz      46   47      94             96         84
## 2   Solomon Tunde      64   84      43             60         83
## 3   Daniel Zainea      47   82      52             58         96
## 4    Thomas Brown      96   50      58             71        100
## 5   Lisa Giamanco      40   76      50             85         78
## 6  Cheryl Haridat      75   78      98             41         88
## 7   Winston Cadiz      90   73      90             75         45
## 8    Pablo Romero      91   50      85             68         72
## 9   Jennifer WeiI      98   63      43             58         40
## 10 Ronald Rosario      98   40      98             75         81
## 11  Martha carpio      69   60      52             60         47
## 12 Jasmine carpio      99   86      49             89         93
## 13    Raul Martin      42   59      54             48         80
## 14    Desio Trump      41   42      69             92         55
##    Days_Present
## 1           169
## 2           121
## 3           146
## 4            95
## 5           118
## 6           159
## 7           175
## 8           101
## 9           107
## 10           94
## 11          171
## 12          140
## 13          174
## 14          167
grade2$avg<-(grade2$English +grade2$Math+grade2$Science+grade2$Social_Studies+grade2$Technology)/5
grade2
##           Student English Math Science Social_Studies Technology
## 1     Alicia Cruz      46   47      94             96         84
## 2   Solomon Tunde      64   84      43             60         83
## 3   Daniel Zainea      47   82      52             58         96
## 4    Thomas Brown      96   50      58             71        100
## 5   Lisa Giamanco      40   76      50             85         78
## 6  Cheryl Haridat      75   78      98             41         88
## 7   Winston Cadiz      90   73      90             75         45
## 8    Pablo Romero      91   50      85             68         72
## 9   Jennifer WeiI      98   63      43             58         40
## 10 Ronald Rosario      98   40      98             75         81
## 11  Martha carpio      69   60      52             60         47
## 12 Jasmine carpio      99   86      49             89         93
## 13    Raul Martin      42   59      54             48         80
## 14    Desio Trump      41   42      69             92         55
##    Days_Present  avg
## 1           169 73.4
## 2           121 66.8
## 3           146 67.0
## 4            95 75.0
## 5           118 65.8
## 6           159 76.0
## 7           175 74.6
## 8           101 73.2
## 9           107 60.4
## 10           94 78.4
## 11          171 57.6
## 12          140 83.2
## 13          174 56.6
## 14          167 59.8
cor(grade2$Days_Present, grade2$avg)
## [1] -0.3066169