setwd("~/Desktop/IS607/Data-607/Project 2")

library(stringr) 
library(gdata)
## gdata: read.xls support for 'XLS' (Excel 97-2004) files ENABLED.
## 
## gdata: read.xls support for 'XLSX' (Excel 2007+) files ENABLED.
## 
## Attaching package: 'gdata'
## The following object is masked from 'package:stats':
## 
##     nobs
## The following object is masked from 'package:utils':
## 
##     object.size
## The following object is masked from 'package:base':
## 
##     startsWith
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:gdata':
## 
##     combine, first, last
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyr)
library(ggplot2)
library(scales)

Description of Task

Your task is to:

  1. Choose any three of the “wide” datasets identified in the Week 6 Discussion items. (You may use your own dataset; please don’t use my Sample Post dataset, since that was used in your Week 6 assignment!)

  2. For each of the three chosen datasets:
  1. Please include in your homework submission, for each of the three chosen datasets:  The URL to the .Rmd file in your GitHub repository, and  The URL for your rpubs.com web page.

Untidy dataset 1 - Crosstab query from Jose Zuniga

Read in raw data from Excel file

RawData <- read.xls("UntidyCases.xlsx")

Rename Column Headings

RawData2 <- rename(RawData, Region_1 = Region.1,
                 Region_2 = Region.2, 
                 Region_3 = Region.3, 
                 Region_4 = Region.4, 
                 Region_5 = Region.5)
## Warning: failed to assign NativeSymbolInfo for env since env is already
## defined in the 'lazyeval' namespace

Remove Row TOTAL to right

RawData3 <- subset(RawData2, select = -TOTAL)

Remove Col TOTAL at bottom

RawData4 <- RawData3[RawData3$Month != "TOTAL",]

Replace Month string with number so plot is correct

RawData4$Month<-str_replace_all(RawData4$Month, "April", 4)
RawData4$Month<-str_replace_all(RawData4$Month, "May", 5)
RawData4$Month<-str_replace_all(RawData4$Month, "June", 6)
RawData4$Month<-str_replace_all(RawData4$Month, "July", 7)
RawData4$Month<-str_replace_all(RawData4$Month, "August", 8)
RawData4$Month<-str_replace_all(RawData4$Month, "September", 9)
RawData4$Month<-str_replace_all(RawData4$Month, "October", 10)
RawData4$Month<-str_replace_all(RawData4$Month, "November", 11)
RawData4$Month<-str_replace_all(RawData4$Month, "December", 12)

RawData5 <- mutate(RawData4, MonthNum = as.numeric(RawData4$Month) * 1)

Unpivot data so we have one row per observation

FinalCases <- gather(RawData5, region, value, `Region_1`:`Region_5`) 

Analysis

From Jose: Compare Monthly citizenship for given regions

After researching more this is actually case counts by region (not population)

ggplot(data=FinalCases, aes(x=FinalCases$MonthNum, y=FinalCases$value, 
          group = FinalCases$region, colour = FinalCases$region, label = FinalCases$value)) +
      geom_line() +  ggtitle("Count of Cases by Month") + labs(x="Month",y="Case Count") + geom_label()

Summary of results from graph

For every time point region 3 has the highest case count. Case counts generally increase the most from April to May and remain high until December. Regions 1 and 4 have the smallest amounts of cases for each time point.

Untidy dataset 2 - Trains data from Richard Pantoliano

Read in raw data from Excel file

RawData <- read.xls("Train.xlsx")

Rename missing header row

colnames(RawData)[1] <- "Train"
colnames(RawData)[2] <- "Status"

Rename missing train names

RawData[2, 1] <- "SHINKANSEN"
RawData[4, 1] <- "TGV"

Unpivot data

RawDataUnpiv <- gather(RawData, "City", "Count", 3:7)

Rename city with . in name

RawDataUnpiv$City <- str_replace(RawDataUnpiv$City, "[.]", " ")

Finally move train status into its own variable for analysis

FinalTrain <-  spread(RawDataUnpiv, Status, Count)

Analysis

Questions From Richard

1. Which was the more reliable service for each city and by how much?

DelayCityTrain <- FinalTrain %>% 
  group_by(Train, City) %>%
  summarise(TotalDelayed=sum(Delayed),TotalOnTime=sum(`On Time`), PercentDelayed=round((TotalDelayed/(TotalDelayed+TotalOnTime))*100, 2))

DelayCityTrain
## Source: local data frame [10 x 5]
## Groups: Train [?]
## 
##         Train       City TotalDelayed TotalOnTime PercentDelayed
##        <fctr>      <chr>        <int>       <int>          <dbl>
## 1  SHINKANSEN   Atlantis           62         497          11.09
## 2  SHINKANSEN  El Dorado           12         221           5.15
## 3  SHINKANSEN Hyperborea           20         212           8.62
## 4  SHINKANSEN     Narnia          102         503          16.86
## 5  SHINKANSEN   Valhalla          305        1841          14.21
## 6         TGV   Atlantis          117         694          14.43
## 7         TGV  El Dorado          415        4840           7.90
## 8         TGV Hyperborea           65         383          14.51
## 9         TGV     Narnia          129         320          28.73
## 10        TGV   Valhalla           61         201          23.28

Shinkansen was the most reliable service for each city examined with delays ranging from 5.2% - 16.9%. For TGV the range of delays was 7.9% - 28.7%.

2. Which was the most reliable train service, overall?

DelayOverall <- FinalTrain %>% 
  group_by(Train) %>%
  summarise(TotalDelayed=sum(Delayed),TotalOnTime=sum(`On Time`), PercentDelayed=round((TotalDelayed/(TotalDelayed+TotalOnTime))*100, 2))

DelayOverall
## # A tibble: 2 × 4
##        Train TotalDelayed TotalOnTime PercentDelayed
##       <fctr>        <int>       <int>          <dbl>
## 1 SHINKANSEN          501        3274          13.27
## 2        TGV          787        6438          10.89

Overall the trend reverses and the most reliable train service was TGV at 10.9% delayed. Shinkansen was at 13.3% delayed overall.

3. In aggregate, which city had the most reliably on-time train service

DelayCity<- FinalTrain %>% 
  group_by(City) %>%
  summarise(TotalDelayed=sum(Delayed),TotalOnTime=sum(`On Time`), PercentDelayed=round((TotalDelayed/(TotalDelayed+TotalOnTime))*100, 2))

DelayCity
## # A tibble: 5 × 4
##         City TotalDelayed TotalOnTime PercentDelayed
##        <chr>        <int>       <int>          <dbl>
## 1   Atlantis          179        1191          13.07
## 2  El Dorado          427        5061           7.78
## 3 Hyperborea           85         595          12.50
## 4     Narnia          231         823          21.92
## 5   Valhalla          366        2042          15.20

Overall, El Dorado had the most ontime service with only 7.8% of the trains delayed. Narnia had the worst on-time performace with 21.9% delayed.

Untidy dataset 3 - Religion and income data from Marco Siqueira Campos

Data taken from http://www.pewforum.org/religious-landscape-study/income-distribution/.

Read in raw data from Excel file

RawData <- read.xls("ReligionIncome.xlsx")

Rename Column Headings

RawData2 <- rename(RawData, religion = Religious.tradition, "<$30k" = Less.than..30.000,
                   "$30,000-49,999" = X.30.000..49.999, 
                   "$50,000-99,999" = X.50.000..99.999, 
                   ">$100k" = X.100.000.or.more)

Get n for each % value of sample

RawData3 <- mutate(RawData2, 
                   "<$30k_n" = round(`<$30k` * RawData2$Sample, 0),
                   "$30,000-49,999_n" = round(`$30,000-49,999` * RawData2$Sample, 0),
                   "$50,000-99,999_n" = round(`$50,000-99,999` * RawData2$Sample, 0),
                   ">$100k_n" = round(`>$100k` * RawData2$Sample, 0)
                   )

Remove Row TOTAL to right

RawData4 <- subset(RawData3, select = -c(`<$30k`,`$30,000-49,999`,`$50,000-99,999`, `>$100k` , Sample))

Unpivot data so we have one row per observation

FinalReligion <- gather(RawData4, income, frequency, -religion)

Clean up some of the values for religion and income

FinalReligion$income <- str_replace(FinalReligion$income, "_n", "")

FinalReligion$religion <- str_replace_all(FinalReligion$religion, " \\(religious \\\\nones\\\\\\)", "")

Analysis from Marco - Analyze the Income by religion

ggplot() + 
  geom_bar(data = FinalReligion,
           aes(x = FinalReligion$religion, y = FinalReligion$frequency, fill = FinalReligion$income),
           position = "fill", stat = "identity") + labs(x="Religion",y="Frequency") + scale_y_continuous(labels = percent_format()) + coord_flip() 

Summary of results from graph

From the graph we can see that religious tradition clearly varies by income level. We can see that for the highest income category (>$100k), ‘Jewish’ followed by ‘Hindu’ have the highest proportions. If we examine the lowest income category (<$30k) we see that ‘Historical Black Protestant’ followed by ‘Jehovah’s Witness’ have the highest proportions.