# Get the packages we need.
require(tidyr)
require(dplyr)
require(knitr)
require(stringr)
require(ggplot2)

Data Set 1 - School Grades - By Puneet Auluck (Myself)

# Read csv file
grades <- read.csv(file = "C:\\project2\\grades.csv", stringsAsFactors = FALSE)
kable(head(grades))
Student English Math Science Social.Studies Technology Days.Present
Alicia Cruz 46 47 94 96 84 169
Solomon Tunde 64 84 43 60 83 121
Daniel Zainea 47 82 52 58 96 146
Thomas Brown 96 50 58 71 100 95
Lisa Giamanco 40 76 50 85 78 118
Cheryl Haridat 75 78 98 41 88 159

1. Is there a correlation between how well a student did in the class and their school attendance?

 The original grades.csv wide format is made into long by gathering Subject and corresponding Grade.

grades_l <- grades %>% gather("Subject", "Grades", 2:6)
kable(head(grades_l %>% arrange(desc(Days.Present))))
Student Days.Present Subject Grades
Winston Cadiz 175 English 90
Winston Cadiz 175 Math 73
Winston Cadiz 175 Science 90
Winston Cadiz 175 Social.Studies 75
Winston Cadiz 175 Technology 45
Raul Martin 174 English 42

  Let’s find grade average for each student.

grades_avg <- grades_l %>% group_by(Student,Days.Present) %>%
              summarise(Avg=mean(Grades))
              
kable(grades_avg %>% arrange(desc(Days.Present)))
Student Days.Present Avg
Alicia Cruz 169 73.4
Cheryl Haridat 159 76.0
Daniel Zainea 146 67.0
Desio Trump 167 59.8
Jasmine Carpio 140 83.2
Jennifer Weil 107 60.4
Lisa Giamanco 118 65.8
Martha Carpio 171 57.6
Pablo Romero 101 73.2
Raul Martin 174 56.6
Ronald Rosario 94 78.4
Solomon Tunde 121 66.8
Thomas Brown 95 75.0
Winston Cadiz 175 74.6

 Student, Winston Cadiz, with highest attendance has grade average of 74.6 and student, Ronald Rosario, with lowest attendance has 78.4. Based on grades.csv table, there no indication of a positive correlation between the days a student is present in school and how well he/she does academically.

 Here is a graph to visually represent the conclusion.

grades_plot <- ggplot(grades_avg, aes(y=Avg, x=Days.Present)) + geom_point(shape=1)
grades_plot + geom_smooth()
## geom_smooth: method="auto" and size of largest group is <1000, so using loess. Use 'method = x' to change the smoothing method.

2. How are the grades by subject?

grades_plot2 <- ggplot(data= grades_l, aes(y=Grades, x=Days.Present, fill=factor(Subject))) + geom_point() 
grades_plot2 + geom_smooth() + facet_wrap(~Subject)
## geom_smooth: method="auto" and size of largest group is <1000, so using loess. Use 'method = x' to change the smoothing method.
## geom_smooth: method="auto" and size of largest group is <1000, so using loess. Use 'method = x' to change the smoothing method.
## geom_smooth: method="auto" and size of largest group is <1000, so using loess. Use 'method = x' to change the smoothing method.
## geom_smooth: method="auto" and size of largest group is <1000, so using loess. Use 'method = x' to change the smoothing method.
## geom_smooth: method="auto" and size of largest group is <1000, so using loess. Use 'method = x' to change the smoothing method.

  It appears, aside from Science and Technology, the grades are affected by the number of days a student is present in class.

Data Set 2 - Living Former Presidents - By Youqing Xiang

# Read csv file
presidents <- read.csv(file = "C:\\project2\\presidents.csv", stringsAsFactors = FALSE)
kable(presidents)
President OfficeTerm BirthDate
George H. W. Bush 1989-1993 June 12,1994 (age 91)
Jimmy Carter 1977-1981 October 1, 1924 (age 90)
George W. Bush 2001-2009 July 6, 1946 (age 69)
Bill Clinton 1993-2001 August 19, 1946 (age 69)

 Tidy data

presidents_tidy <- presidents %>%
                   separate(OfficeTerm, into=c("OfficeTermStart", "OfficeTermEnd"), sep="-") %>%
                   mutate(OfficeYears = as.numeric(OfficeTermEnd)-as.numeric(OfficeTermStart)) %>%
                   separate(BirthDate, into=c("BirthDate", "Age"), sep="\\(age ")
presidents_tidy$Age <- str_replace(presidents_tidy$Age, "\\)$","")
kable(presidents_tidy)
President OfficeTermStart OfficeTermEnd BirthDate Age OfficeYears
George H. W. Bush 1989 1993 June 12,1994 91 4
Jimmy Carter 1977 1981 October 1, 1924 90 4
George W. Bush 2001 2009 July 6, 1946 69 8
Bill Clinton 1993 2001 August 19, 1946 69 8

1. Give the list of presidents who had one term(4 years in the office) and presidents who had two terms(8 years in the office)

kable(presidents_tidy[presidents_tidy$OfficeYears == 4,])
President OfficeTermStart OfficeTermEnd BirthDate Age OfficeYears
George H. W. Bush 1989 1993 June 12,1994 91 4
Jimmy Carter 1977 1981 October 1, 1924 90 4
kable(presidents_tidy[presidents_tidy$OfficeYears == 8,])
President OfficeTermStart OfficeTermEnd BirthDate Age OfficeYears
3 George W. Bush 2001 2009 July 6, 1946 69 8
4 Bill Clinton 1993 2001 August 19, 1946 69 8

2. Which president is the oldest one?

kable(presidents_tidy %>% select(President,Age) %>% filter(Age == max(presidents_tidy$Age)))
President Age
George H. W. Bush 91

3. What is the average age of all the presidents?

mean(as.numeric(as.character(presidents_tidy$Age)))
## [1] 79.75

Data Set: 3 - U.S. International Trade - By Jason Joseph

# Read csv file
trade <- read.csv(file = "C:\\project2\\trades.csv", stringsAsFactors = FALSE)
colnames(trade) <-c("Period","Type","2013-Exports","2013-Imports","2014-Exports","2014-Imports","2015-Exports","2015-Imports")
kable(trade)
Period Type 2013-Exports 2013-Imports 2014-Exports 2014-Imports 2015-Exports 2015-Imports
Exports Imports Exports Imports Exports Imports

January Goods 131,228 191,745 133,738 193,706 129,292 192,242
Services 56883 37,995 59,141 38,635 59,733 40,371

February Goods 132084 193,731 131,768 193,060 126,329 184,370
Services 56,828 37,935 57,726 39,270 59,560 40,057

March Goods 130,093 186,326 135,923 198,973 127,183 198,347
Services 56,647 37,938 58,837 38,908 59,583 40,581

April Goods 131,468 189,507 135,556 199,877 129,376 190,967
Services 56,592 38,318 59,468 39,417 59,766 40,432


 Drop unwanted rows and add missing data in first column.

trade[c(seq(4, 13, 3)), 1] <- trade[c(seq(3,12,3)), 1] 
trade <- trade[c( -1, -2,c(seq(-11, -5, 3))),] #drop unwanted rows
kable(trade)
Period Type 2013-Exports 2013-Imports 2014-Exports 2014-Imports 2015-Exports 2015-Imports
3 January Goods 131,228 191,745 133,738 193,706 129,292 192,242
4 January Services 56883 37,995 59,141 38,635 59,733 40,371
6 February Goods 132084 193,731 131,768 193,060 126,329 184,370
7 February Services 56,828 37,935 57,726 39,270 59,560 40,057
9 March Goods 130,093 186,326 135,923 198,973 127,183 198,347
10 March Services 56,647 37,938 58,837 38,908 59,583 40,581
12 April Goods 131,468 189,507 135,556 199,877 129,376 190,967
13 April Services 56,592 38,318 59,468 39,417 59,766 40,432


 Tidy data

trade_tidy <- trade %>% gather("TradeType", "Trades", 3:8) %>%
              separate(TradeType, into=c("Year","TradeType"), sep="-")

trade_tidy$Trades <- as.numeric(gsub(",", "", as.character(trade_tidy$Trades)))
trade_tidy$Type <- str_trim(trade_tidy$Type)
kable(trade_tidy)
Period Type Year TradeType Trades
January Goods 2013 Exports 131228
January Services 2013 Exports 56883
February Goods 2013 Exports 132084
February Services 2013 Exports 56828
March Goods 2013 Exports 130093
March Services 2013 Exports 56647
April Goods 2013 Exports 131468
April Services 2013 Exports 56592
January Goods 2013 Imports 191745
January Services 2013 Imports 37995
February Goods 2013 Imports 193731
February Services 2013 Imports 37935
March Goods 2013 Imports 186326
March Services 2013 Imports 37938
April Goods 2013 Imports 189507
April Services 2013 Imports 38318
January Goods 2014 Exports 133738
January Services 2014 Exports 59141
February Goods 2014 Exports 131768
February Services 2014 Exports 57726
March Goods 2014 Exports 135923
March Services 2014 Exports 58837
April Goods 2014 Exports 135556
April Services 2014 Exports 59468
January Goods 2014 Imports 193706
January Services 2014 Imports 38635
February Goods 2014 Imports 193060
February Services 2014 Imports 39270
March Goods 2014 Imports 198973
March Services 2014 Imports 38908
April Goods 2014 Imports 199877
April Services 2014 Imports 39417
January Goods 2015 Exports 129292
January Services 2015 Exports 59733
February Goods 2015 Exports 126329
February Services 2015 Exports 59560
March Goods 2015 Exports 127183
March Services 2015 Exports 59583
April Goods 2015 Exports 129376
April Services 2015 Exports 59766
January Goods 2015 Imports 192242
January Services 2015 Imports 40371
February Goods 2015 Imports 184370
February Services 2015 Imports 40057
March Goods 2015 Imports 198347
March Services 2015 Imports 40581
April Goods 2015 Imports 190967
April Services 2015 Imports 40432


 Summarise the data

trade_summary <- trade_tidy %>% group_by(Period, Type, TradeType) %>% summarise(Avg=mean(Trades)) %>% arrange(desc(Avg))
kable(trade_summary)
Period Type TradeType Avg
April Goods Imports 193450.33
April Goods Exports 132133.33
April Services Exports 58608.67
April Services Imports 39389.00
February Goods Imports 190387.00
February Goods Exports 130060.33
February Services Exports 58038.00
February Services Imports 39087.33
January Goods Imports 192564.33
January Goods Exports 131419.33
January Services Exports 58585.67
January Services Imports 39000.33
March Goods Imports 194548.67
March Goods Exports 131066.33
March Services Exports 58355.67
March Services Imports 39142.33
ggplot(trade_summary, aes(x=TradeType, y=Avg, fill=TradeType)) + geom_bar(stat = "identity") + facet_grid(Type ~ Period) + labs(x = "Export/Import", y = "Average") + ggtitle("Three Year(2013-2014) Average Span")

 From examining the plot above, it shows more services are exported than goods. However number of goods exported and imported are much higher than the services.

1. Calculate the average goods export for each month over the three-year span.

kable(trade_summary %>% filter(Type == "Goods" & TradeType == "Exports"))
Period Type TradeType Avg
April Goods Exports 132133.3
February Goods Exports 130060.3
January Goods Exports 131419.3
March Goods Exports 131066.3

2. Calculate the average goods import for each month over the three-year span.

kable(trade_summary %>% filter(Type == "Goods" & TradeType == "Imports"))
Period Type TradeType Avg
April Goods Imports 193450.3
February Goods Imports 190387.0
January Goods Imports 192564.3
March Goods Imports 194548.7

3. Calculate the average services export for each month over the three-year span.

kable(trade_summary %>% filter(Type == "Services" & TradeType == "Exports"))
Period Type TradeType Avg
April Services Exports 58608.67
February Services Exports 58038.00
January Services Exports 58585.67
March Services Exports 58355.67

4. Calculate the average services import for each month over the three-year span.

kable(trade_summary %>% filter(Type == "Services" & TradeType == "Imports"))
Period Type TradeType Avg
April Services Imports 39389.00
February Services Imports 39087.33
January Services Imports 39000.33
March Services Imports 39142.33