# Get the packages we need.
require(tidyr)
require(dplyr)
require(knitr)
require(stringr)
require(ggplot2)
# Read csv file
grades <- read.csv(file = "C:\\project2\\grades.csv", stringsAsFactors = FALSE)
kable(head(grades))
| Student | English | Math | Science | Social.Studies | Technology | Days.Present |
|---|---|---|---|---|---|---|
| Alicia Cruz | 46 | 47 | 94 | 96 | 84 | 169 |
| Solomon Tunde | 64 | 84 | 43 | 60 | 83 | 121 |
| Daniel Zainea | 47 | 82 | 52 | 58 | 96 | 146 |
| Thomas Brown | 96 | 50 | 58 | 71 | 100 | 95 |
| Lisa Giamanco | 40 | 76 | 50 | 85 | 78 | 118 |
| Cheryl Haridat | 75 | 78 | 98 | 41 | 88 | 159 |
The original grades.csv wide format is made into long by gathering Subject and corresponding Grade.
grades_l <- grades %>% gather("Subject", "Grades", 2:6)
kable(head(grades_l %>% arrange(desc(Days.Present))))
| Student | Days.Present | Subject | Grades |
|---|---|---|---|
| Winston Cadiz | 175 | English | 90 |
| Winston Cadiz | 175 | Math | 73 |
| Winston Cadiz | 175 | Science | 90 |
| Winston Cadiz | 175 | Social.Studies | 75 |
| Winston Cadiz | 175 | Technology | 45 |
| Raul Martin | 174 | English | 42 |
Let’s find grade average for each student.
grades_avg <- grades_l %>% group_by(Student,Days.Present) %>%
summarise(Avg=mean(Grades))
kable(grades_avg %>% arrange(desc(Days.Present)))
| Student | Days.Present | Avg |
|---|---|---|
| Alicia Cruz | 169 | 73.4 |
| Cheryl Haridat | 159 | 76.0 |
| Daniel Zainea | 146 | 67.0 |
| Desio Trump | 167 | 59.8 |
| Jasmine Carpio | 140 | 83.2 |
| Jennifer Weil | 107 | 60.4 |
| Lisa Giamanco | 118 | 65.8 |
| Martha Carpio | 171 | 57.6 |
| Pablo Romero | 101 | 73.2 |
| Raul Martin | 174 | 56.6 |
| Ronald Rosario | 94 | 78.4 |
| Solomon Tunde | 121 | 66.8 |
| Thomas Brown | 95 | 75.0 |
| Winston Cadiz | 175 | 74.6 |
Student, Winston Cadiz, with highest attendance has grade average of 74.6 and student, Ronald Rosario, with lowest attendance has 78.4. Based on grades.csv table, there no indication of a positive correlation between the days a student is present in school and how well he/she does academically.
Here is a graph to visually represent the conclusion.
grades_plot <- ggplot(grades_avg, aes(y=Avg, x=Days.Present)) + geom_point(shape=1)
grades_plot + geom_smooth()
## geom_smooth: method="auto" and size of largest group is <1000, so using loess. Use 'method = x' to change the smoothing method.
grades_plot2 <- ggplot(data= grades_l, aes(y=Grades, x=Days.Present, fill=factor(Subject))) + geom_point()
grades_plot2 + geom_smooth() + facet_wrap(~Subject)
## geom_smooth: method="auto" and size of largest group is <1000, so using loess. Use 'method = x' to change the smoothing method.
## geom_smooth: method="auto" and size of largest group is <1000, so using loess. Use 'method = x' to change the smoothing method.
## geom_smooth: method="auto" and size of largest group is <1000, so using loess. Use 'method = x' to change the smoothing method.
## geom_smooth: method="auto" and size of largest group is <1000, so using loess. Use 'method = x' to change the smoothing method.
## geom_smooth: method="auto" and size of largest group is <1000, so using loess. Use 'method = x' to change the smoothing method.
# Read csv file
presidents <- read.csv(file = "C:\\project2\\presidents.csv", stringsAsFactors = FALSE)
kable(presidents)
| President | OfficeTerm | BirthDate |
|---|---|---|
| George H. W. Bush | 1989-1993 | June 12,1994 (age 91) |
| Jimmy Carter | 1977-1981 | October 1, 1924 (age 90) |
| George W. Bush | 2001-2009 | July 6, 1946 (age 69) |
| Bill Clinton | 1993-2001 | August 19, 1946 (age 69) |
Tidy data
presidents_tidy <- presidents %>%
separate(OfficeTerm, into=c("OfficeTermStart", "OfficeTermEnd"), sep="-") %>%
mutate(OfficeYears = as.numeric(OfficeTermEnd)-as.numeric(OfficeTermStart)) %>%
separate(BirthDate, into=c("BirthDate", "Age"), sep="\\(age ")
presidents_tidy$Age <- str_replace(presidents_tidy$Age, "\\)$","")
kable(presidents_tidy)
| President | OfficeTermStart | OfficeTermEnd | BirthDate | Age | OfficeYears |
|---|---|---|---|---|---|
| George H. W. Bush | 1989 | 1993 | June 12,1994 | 91 | 4 |
| Jimmy Carter | 1977 | 1981 | October 1, 1924 | 90 | 4 |
| George W. Bush | 2001 | 2009 | July 6, 1946 | 69 | 8 |
| Bill Clinton | 1993 | 2001 | August 19, 1946 | 69 | 8 |
kable(presidents_tidy[presidents_tidy$OfficeYears == 4,])
| President | OfficeTermStart | OfficeTermEnd | BirthDate | Age | OfficeYears |
|---|---|---|---|---|---|
| George H. W. Bush | 1989 | 1993 | June 12,1994 | 91 | 4 |
| Jimmy Carter | 1977 | 1981 | October 1, 1924 | 90 | 4 |
kable(presidents_tidy[presidents_tidy$OfficeYears == 8,])
| President | OfficeTermStart | OfficeTermEnd | BirthDate | Age | OfficeYears | |
|---|---|---|---|---|---|---|
| 3 | George W. Bush | 2001 | 2009 | July 6, 1946 | 69 | 8 |
| 4 | Bill Clinton | 1993 | 2001 | August 19, 1946 | 69 | 8 |
kable(presidents_tidy %>% select(President,Age) %>% filter(Age == max(presidents_tidy$Age)))
| President | Age |
|---|---|
| George H. W. Bush | 91 |
mean(as.numeric(as.character(presidents_tidy$Age)))
## [1] 79.75
# Read csv file
trade <- read.csv(file = "C:\\project2\\trades.csv", stringsAsFactors = FALSE)
colnames(trade) <-c("Period","Type","2013-Exports","2013-Imports","2014-Exports","2014-Imports","2015-Exports","2015-Imports")
kable(trade)
| Period | Type | 2013-Exports | 2013-Imports | 2014-Exports | 2014-Imports | 2015-Exports | 2015-Imports |
|---|---|---|---|---|---|---|---|
| Exports | Imports | Exports | Imports | Exports | Imports |
January Goods 131,228 191,745 133,738 193,706 129,292 192,242
Services 56883 37,995 59,141 38,635 59,733 40,371
February Goods 132084 193,731 131,768 193,060 126,329 184,370
Services 56,828 37,935 57,726 39,270 59,560 40,057
March Goods 130,093 186,326 135,923 198,973 127,183 198,347
Services 56,647 37,938 58,837 38,908 59,583 40,581
April Goods 131,468 189,507 135,556 199,877 129,376 190,967
Services 56,592 38,318 59,468 39,417 59,766 40,432
Drop unwanted rows and add missing data in first column.
trade[c(seq(4, 13, 3)), 1] <- trade[c(seq(3,12,3)), 1]
trade <- trade[c( -1, -2,c(seq(-11, -5, 3))),] #drop unwanted rows
kable(trade)
| Period | Type | 2013-Exports | 2013-Imports | 2014-Exports | 2014-Imports | 2015-Exports | 2015-Imports | |
|---|---|---|---|---|---|---|---|---|
| 3 | January | Goods | 131,228 | 191,745 | 133,738 | 193,706 | 129,292 | 192,242 |
| 4 | January | Services | 56883 | 37,995 | 59,141 | 38,635 | 59,733 | 40,371 |
| 6 | February | Goods | 132084 | 193,731 | 131,768 | 193,060 | 126,329 | 184,370 |
| 7 | February | Services | 56,828 | 37,935 | 57,726 | 39,270 | 59,560 | 40,057 |
| 9 | March | Goods | 130,093 | 186,326 | 135,923 | 198,973 | 127,183 | 198,347 |
| 10 | March | Services | 56,647 | 37,938 | 58,837 | 38,908 | 59,583 | 40,581 |
| 12 | April | Goods | 131,468 | 189,507 | 135,556 | 199,877 | 129,376 | 190,967 |
| 13 | April | Services | 56,592 | 38,318 | 59,468 | 39,417 | 59,766 | 40,432 |
Tidy data
trade_tidy <- trade %>% gather("TradeType", "Trades", 3:8) %>%
separate(TradeType, into=c("Year","TradeType"), sep="-")
trade_tidy$Trades <- as.numeric(gsub(",", "", as.character(trade_tidy$Trades)))
trade_tidy$Type <- str_trim(trade_tidy$Type)
kable(trade_tidy)
| Period | Type | Year | TradeType | Trades |
|---|---|---|---|---|
| January | Goods | 2013 | Exports | 131228 |
| January | Services | 2013 | Exports | 56883 |
| February | Goods | 2013 | Exports | 132084 |
| February | Services | 2013 | Exports | 56828 |
| March | Goods | 2013 | Exports | 130093 |
| March | Services | 2013 | Exports | 56647 |
| April | Goods | 2013 | Exports | 131468 |
| April | Services | 2013 | Exports | 56592 |
| January | Goods | 2013 | Imports | 191745 |
| January | Services | 2013 | Imports | 37995 |
| February | Goods | 2013 | Imports | 193731 |
| February | Services | 2013 | Imports | 37935 |
| March | Goods | 2013 | Imports | 186326 |
| March | Services | 2013 | Imports | 37938 |
| April | Goods | 2013 | Imports | 189507 |
| April | Services | 2013 | Imports | 38318 |
| January | Goods | 2014 | Exports | 133738 |
| January | Services | 2014 | Exports | 59141 |
| February | Goods | 2014 | Exports | 131768 |
| February | Services | 2014 | Exports | 57726 |
| March | Goods | 2014 | Exports | 135923 |
| March | Services | 2014 | Exports | 58837 |
| April | Goods | 2014 | Exports | 135556 |
| April | Services | 2014 | Exports | 59468 |
| January | Goods | 2014 | Imports | 193706 |
| January | Services | 2014 | Imports | 38635 |
| February | Goods | 2014 | Imports | 193060 |
| February | Services | 2014 | Imports | 39270 |
| March | Goods | 2014 | Imports | 198973 |
| March | Services | 2014 | Imports | 38908 |
| April | Goods | 2014 | Imports | 199877 |
| April | Services | 2014 | Imports | 39417 |
| January | Goods | 2015 | Exports | 129292 |
| January | Services | 2015 | Exports | 59733 |
| February | Goods | 2015 | Exports | 126329 |
| February | Services | 2015 | Exports | 59560 |
| March | Goods | 2015 | Exports | 127183 |
| March | Services | 2015 | Exports | 59583 |
| April | Goods | 2015 | Exports | 129376 |
| April | Services | 2015 | Exports | 59766 |
| January | Goods | 2015 | Imports | 192242 |
| January | Services | 2015 | Imports | 40371 |
| February | Goods | 2015 | Imports | 184370 |
| February | Services | 2015 | Imports | 40057 |
| March | Goods | 2015 | Imports | 198347 |
| March | Services | 2015 | Imports | 40581 |
| April | Goods | 2015 | Imports | 190967 |
| April | Services | 2015 | Imports | 40432 |
Summarise the data
trade_summary <- trade_tidy %>% group_by(Period, Type, TradeType) %>% summarise(Avg=mean(Trades)) %>% arrange(desc(Avg))
kable(trade_summary)
| Period | Type | TradeType | Avg |
|---|---|---|---|
| April | Goods | Imports | 193450.33 |
| April | Goods | Exports | 132133.33 |
| April | Services | Exports | 58608.67 |
| April | Services | Imports | 39389.00 |
| February | Goods | Imports | 190387.00 |
| February | Goods | Exports | 130060.33 |
| February | Services | Exports | 58038.00 |
| February | Services | Imports | 39087.33 |
| January | Goods | Imports | 192564.33 |
| January | Goods | Exports | 131419.33 |
| January | Services | Exports | 58585.67 |
| January | Services | Imports | 39000.33 |
| March | Goods | Imports | 194548.67 |
| March | Goods | Exports | 131066.33 |
| March | Services | Exports | 58355.67 |
| March | Services | Imports | 39142.33 |
ggplot(trade_summary, aes(x=TradeType, y=Avg, fill=TradeType)) + geom_bar(stat = "identity") + facet_grid(Type ~ Period) + labs(x = "Export/Import", y = "Average") + ggtitle("Three Year(2013-2014) Average Span")
From examining the plot above, it shows more services are exported than goods. However number of goods exported and imported are much higher than the services.
kable(trade_summary %>% filter(Type == "Goods" & TradeType == "Exports"))
| Period | Type | TradeType | Avg |
|---|---|---|---|
| April | Goods | Exports | 132133.3 |
| February | Goods | Exports | 130060.3 |
| January | Goods | Exports | 131419.3 |
| March | Goods | Exports | 131066.3 |
kable(trade_summary %>% filter(Type == "Goods" & TradeType == "Imports"))
| Period | Type | TradeType | Avg |
|---|---|---|---|
| April | Goods | Imports | 193450.3 |
| February | Goods | Imports | 190387.0 |
| January | Goods | Imports | 192564.3 |
| March | Goods | Imports | 194548.7 |
kable(trade_summary %>% filter(Type == "Services" & TradeType == "Exports"))
| Period | Type | TradeType | Avg |
|---|---|---|---|
| April | Services | Exports | 58608.67 |
| February | Services | Exports | 58038.00 |
| January | Services | Exports | 58585.67 |
| March | Services | Exports | 58355.67 |
kable(trade_summary %>% filter(Type == "Services" & TradeType == "Imports"))
| Period | Type | TradeType | Avg |
|---|---|---|---|
| April | Services | Imports | 39389.00 |
| February | Services | Imports | 39087.33 |
| January | Services | Imports | 39000.33 |
| March | Services | Imports | 39142.33 |