setwd("~/Desktop/IS607/Data-607/Project 2")
library(stringr)
library(gdata)
## gdata: read.xls support for 'XLS' (Excel 97-2004) files ENABLED.
##
## gdata: read.xls support for 'XLSX' (Excel 2007+) files ENABLED.
##
## Attaching package: 'gdata'
## The following object is masked from 'package:stats':
##
## nobs
## The following object is masked from 'package:utils':
##
## object.size
## The following object is masked from 'package:base':
##
## startsWith
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:gdata':
##
## combine, first, last
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
library(ggplot2)
library(scales)
Your task is to:
Choose any three of the “wide” datasets identified in the Week 6 Discussion items. (You may use your own dataset; please don’t use my Sample Post dataset, since that was used in your Week 6 assignment!)
RawData <- read.xls("UntidyCases.xlsx")
RawData2 <- rename(RawData, Region_1 = Region.1,
Region_2 = Region.2,
Region_3 = Region.3,
Region_4 = Region.4,
Region_5 = Region.5)
## Warning: failed to assign NativeSymbolInfo for env since env is already
## defined in the 'lazyeval' namespace
RawData3 <- subset(RawData2, select = -TOTAL)
RawData4 <- RawData3[RawData3$Month != "TOTAL",]
RawData4$Month<-str_replace_all(RawData4$Month, "April", 4)
RawData4$Month<-str_replace_all(RawData4$Month, "May", 5)
RawData4$Month<-str_replace_all(RawData4$Month, "June", 6)
RawData4$Month<-str_replace_all(RawData4$Month, "July", 7)
RawData4$Month<-str_replace_all(RawData4$Month, "August", 8)
RawData4$Month<-str_replace_all(RawData4$Month, "September", 9)
RawData4$Month<-str_replace_all(RawData4$Month, "October", 10)
RawData4$Month<-str_replace_all(RawData4$Month, "November", 11)
RawData4$Month<-str_replace_all(RawData4$Month, "December", 12)
RawData5 <- mutate(RawData4, MonthNum = as.numeric(RawData4$Month) * 1)
FinalCases <- gather(RawData5, region, value, `Region_1`:`Region_5`)
ggplot(data=FinalCases, aes(x=FinalCases$MonthNum, y=FinalCases$value,
group = FinalCases$region, colour = FinalCases$region, label = FinalCases$value)) +
geom_line() + ggtitle("Count of Cases by Month") + labs(x="Month",y="Case Count") + geom_label()
For every time point region 3 has the highest case count. Case counts generally increase the most from April to May and remain high until December. Regions 1 and 4 have the smallest amounts of cases for each time point.
RawData <- read.xls("Train.xlsx")
colnames(RawData)[1] <- "Train"
colnames(RawData)[2] <- "Status"
RawData[2, 1] <- "SHINKANSEN"
RawData[4, 1] <- "TGV"
RawDataUnpiv <- gather(RawData, "City", "Count", 3:7)
RawDataUnpiv$City <- str_replace(RawDataUnpiv$City, "[.]", " ")
FinalTrain <- spread(RawDataUnpiv, Status, Count)
DelayCityTrain <- FinalTrain %>%
group_by(Train, City) %>%
summarise(TotalDelayed=sum(Delayed),TotalOnTime=sum(`On Time`), PercentDelayed=round((TotalDelayed/(TotalDelayed+TotalOnTime))*100, 2))
DelayCityTrain
## Source: local data frame [10 x 5]
## Groups: Train [?]
##
## Train City TotalDelayed TotalOnTime PercentDelayed
## <fctr> <chr> <int> <int> <dbl>
## 1 SHINKANSEN Atlantis 62 497 11.09
## 2 SHINKANSEN El Dorado 12 221 5.15
## 3 SHINKANSEN Hyperborea 20 212 8.62
## 4 SHINKANSEN Narnia 102 503 16.86
## 5 SHINKANSEN Valhalla 305 1841 14.21
## 6 TGV Atlantis 117 694 14.43
## 7 TGV El Dorado 415 4840 7.90
## 8 TGV Hyperborea 65 383 14.51
## 9 TGV Narnia 129 320 28.73
## 10 TGV Valhalla 61 201 23.28
Shinkansen was the most reliable service for each city examined with delays ranging from 5.2% - 16.9%. For TGV the range of delays was 7.9% - 28.7%.
DelayOverall <- FinalTrain %>%
group_by(Train) %>%
summarise(TotalDelayed=sum(Delayed),TotalOnTime=sum(`On Time`), PercentDelayed=round((TotalDelayed/(TotalDelayed+TotalOnTime))*100, 2))
DelayOverall
## # A tibble: 2 × 4
## Train TotalDelayed TotalOnTime PercentDelayed
## <fctr> <int> <int> <dbl>
## 1 SHINKANSEN 501 3274 13.27
## 2 TGV 787 6438 10.89
Overall the trend reverses and the most reliable train service was TGV at 10.9% delayed. Shinkansen was at 13.3% delayed overall.
DelayCity<- FinalTrain %>%
group_by(City) %>%
summarise(TotalDelayed=sum(Delayed),TotalOnTime=sum(`On Time`), PercentDelayed=round((TotalDelayed/(TotalDelayed+TotalOnTime))*100, 2))
DelayCity
## # A tibble: 5 × 4
## City TotalDelayed TotalOnTime PercentDelayed
## <chr> <int> <int> <dbl>
## 1 Atlantis 179 1191 13.07
## 2 El Dorado 427 5061 7.78
## 3 Hyperborea 85 595 12.50
## 4 Narnia 231 823 21.92
## 5 Valhalla 366 2042 15.20
Overall, El Dorado had the most ontime service with only 7.8% of the trains delayed. Narnia had the worst on-time performace with 21.9% delayed.
Data taken from http://www.pewforum.org/religious-landscape-study/income-distribution/.
RawData <- read.xls("ReligionIncome.xlsx")
RawData2 <- rename(RawData, religion = Religious.tradition, "<$30k" = Less.than..30.000,
"$30,000-49,999" = X.30.000..49.999,
"$50,000-99,999" = X.50.000..99.999,
">$100k" = X.100.000.or.more)
RawData3 <- mutate(RawData2,
"<$30k_n" = round(`<$30k` * RawData2$Sample, 0),
"$30,000-49,999_n" = round(`$30,000-49,999` * RawData2$Sample, 0),
"$50,000-99,999_n" = round(`$50,000-99,999` * RawData2$Sample, 0),
">$100k_n" = round(`>$100k` * RawData2$Sample, 0)
)
RawData4 <- subset(RawData3, select = -c(`<$30k`,`$30,000-49,999`,`$50,000-99,999`, `>$100k` , Sample))
FinalReligion <- gather(RawData4, income, frequency, -religion)
FinalReligion$income <- str_replace(FinalReligion$income, "_n", "")
FinalReligion$religion <- str_replace_all(FinalReligion$religion, " \\(religious \\\\nones\\\\\\)", "")
ggplot() +
geom_bar(data = FinalReligion,
aes(x = FinalReligion$religion, y = FinalReligion$frequency, fill = FinalReligion$income),
position = "fill", stat = "identity") + labs(x="Religion",y="Frequency") + scale_y_continuous(labels = percent_format()) + coord_flip()
From the graph we can see that religious tradition clearly varies by income level. We can see that for the highest income category (>$100k), ‘Jewish’ followed by ‘Hindu’ have the highest proportions. If we examine the lowest income category (<$30k) we see that ‘Historical Black Protestant’ followed by ‘Jehovah’s Witness’ have the highest proportions.