DATA 607 - Project 2

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(RCurl)

## Loading required package: bitops

library(magrittr)
library(tidyverse)

## -- Attaching packages ----------------------------------------------------------------- tidyverse 1.3.0 --

## v ggplot2 3.2.1     v purrr   0.3.3
## v tibble  2.1.3     v stringr 1.4.0
## v tidyr   1.0.2     v forcats 0.4.0
## v readr   1.3.1

## -- Conflicts -------------------------------------------------------------------- tidyverse_conflicts() --
## x tidyr::complete()  masks RCurl::complete()
## x tidyr::extract()   masks magrittr::extract()
## x dplyr::filter()    masks stats::filter()
## x dplyr::lag()       masks stats::lag()
## x purrr::set_names() masks magrittr::set_names()

library(ggplot2)

#Data set 1: This data relates to the ridership figures across various modes of transport

mbta<-read_csv("https://raw.githubusercontent.com/chitrarth2018/607-Project-2/master/mbta_fin.csv", col_names = TRUE)

## Warning: Missing column names filled in: 'X1' [1]

## Parsed with column specification:
## cols(
##   .default = col_double(),
##   mode = col_character()
## )

## See spec(...) for full column specifications.

#Deleting the first column of the table as it contained unwanted serial numbers
mbta<-mbta[-1]

#transforming the table using the tiydverse functions gather, spread, and separate
#Step - 1:To lengthen the wide data by creating new columns - Period, and Num_rider. This gives us the           Period as values in columns instead of in column names  
#Step - 2:To spread the dataset using the Mode column as key and Num_rider as the value column. 
#Step - 3:To separate the period column into year and month
#Step - 4:To delete some of the unwanted columns

mbta_v1<-mbta%>%gather(Period, Num_rider,2:59)%>%spread(mode,Num_rider)%>%separate(Period,into=c("Year", "Month"), sep="-")%>%select(-3,-9,-12)%>%mutate(Quarter= ifelse(Month %in% c('01', '02', '03'),"Q1",ifelse(Month %in% c('04', '05', '06'),"Q2",ifelse(Month %in% c('07', '08', '09'),"Q3","Q4"))))

#performing analysis to establish time based trends in the ridership data across different modes
#computing yearwise quartely totals to plot a trend chart
mbta_qtr<-mbta_v1%>%group_by(Year, Quarter)%>%summarize(qtr_riders_boat=sum(Boat), qtr_riders_bus=sum(Bus), qtr_riders_crail=sum(`Commuter Rail`), qtr_riders_hrail=sum(`Heavy Rail`),qtr_riders_lrail=sum(`Light Rail`), qtr_riders_prbus=sum(`Private Bus`), qtr_riders_tt=sum(`Trackless Trolley`))%>%unite(Period, Year, Quarter, sep="",remove=FALSE, na.rm=FALSE)

mbta_qtr$Per_num<-seq(1:20)
mbta_qtr=mbta_qtr[-c(20),]

par(pch=22, col="red") # plotting symbol and color
par(mfrow=c(2,4)) # all plots on one page
plot(mbta_qtr$Per_num, mbta_qtr$qtr_riders_boat, type="n",
main="Qtrly Boat Ridership",
ylab="Ridership Boat")
lines(mbta_qtr$Per_num, mbta_qtr$qtr_riders_boat, type="o")

plot(mbta_qtr$Per_num, mbta_qtr$qtr_riders_bus, type="n",
main="Qtrly Bus Ridership", ylab="Ridership Bus")
lines(mbta_qtr$Per_num, mbta_qtr$qtr_riders_bus, type="o")

plot(mbta_qtr$Per_num, mbta_qtr$qtr_riders_crail, type="n",
main="Qtrly CRail Ridership", ylab="Ridership CRail")
lines(mbta_qtr$Per_num, mbta_qtr$qtr_riders_crail, type="o")

plot(mbta_qtr$Per_num, mbta_qtr$qtr_riders_hrail, type="n",
main="Qtrly HRail Ridership", ylab="Ridership HRail")
lines(mbta_qtr$Per_num, mbta_qtr$qtr_riders_hrail, type="o")

plot(mbta_qtr$Per_num, mbta_qtr$qtr_riders_lrail, type="n",
main="Qtrly LRail Ridership", ylab="Ridership LHRail")
lines(mbta_qtr$Per_num, mbta_qtr$qtr_riders_lrail, type="o")

plot(mbta_qtr$Per_num, mbta_qtr$qtr_riders_prbus, type="n",
main="Qtrly Pr Bus Ridership", ylab="Ridership PR Bus")
lines(mbta_qtr$Per_num, mbta_qtr$qtr_riders_prbus, type="o")

plot(mbta_qtr$Per_num, mbta_qtr$qtr_riders_tt, type="n",
main="Qtrly Trolley Ridership", ylab="Ridership Trolley")
lines(mbta_qtr$Per_num, mbta_qtr$qtr_riders_tt, type="o")

#conclusions from eyeballing the charts
#1. The boats and bus ridership data shows high amounts of seasonality with the ridership figures peaking in Q3 every year. While the boats ridership varies in a more less fixed range - bus ridership shows an increasing trend overtime. Also, while the total bus ridership is on an increase, the private bus ridership has gone reduced over time.
#2. While the commuter (CRail) and light rail (Lrail) ridership has shown significant reductions - the heavy rail (HRail) ridership is showing increasing trends

#Dataset 2: The data relates to under 5 years child mortality across 197 countries. The data in the table is in a wide format and our first task is to bring the data in a long format.

u5mr<-read_csv("https://raw.githubusercontent.com/chitrarth2018/607-Project-2/master/unicef-u5mr.csv", col_names = TRUE)

## Parsed with column specification:
## cols(
##   .default = col_double(),
##   CountryName = col_character()
## )

## See spec(...) for full column specifications.

#Converting the wide data into long data and then splitting the year variable to separate out the year. Thereafter data is grouped based on country names

u5mr_v1<-u5mr%>%gather(Year, Mor_Rate, 2:67)%>%separate(Year,into=c("title","Yr_Mor"),sep="-", remove=TRUE)%>%select(-2)%>%group_by(CountryName)

#Mean mortality rate is computed for each country for the 66 year period
u5mr_mean<-u5mr_v1%>%summarize(mor_mean=mean(Mor_Rate, na.rm=TRUE))%>%arrange(-mor_mean)
head(u5mr_mean)

## # A tibble: 6 x 2
##   CountryName  mor_mean
##   <chr>           <dbl>
## 1 Mali             274.
## 2 Sierra Leone     268.
## 3 Niger            262.
## 4 Burkina Faso     247.
## 5 Guinea           235.
## 6 Liberia          224.

#Top 10 Countries having highest mean mortality rate over the 66 year period are separated
u5mr_top10<-u5mr_mean%>%slice(1:10)
view(u5mr_top10)

#Annual moratality rate time series for these 10 countries is separated and the data is converted from wide to long and then year variable is split to separate year 
rate_top10<-merge(u5mr_top10,u5mr,by="CountryName", all.x=TRUE)%>%select(-2)
rate_top10_v1<-rate_top10%>%gather(Year, Mor_Rate, 2:67)%>%separate(Year,into=c("title","Yr_Mor"),sep="-", remove=TRUE)%>%select(-2)%>%arrange(CountryName)

#Mortality rate trends are plotted for these 10 countries.
rate_top10_v1%>%ggplot(aes(x=as.numeric(Yr_Mor), y=Mor_Rate, group=CountryName, color=CountryName)) +
    geom_line()

## Warning: Removed 108 rows containing missing values (geom_path).

#conclusion - as can be seen from the plot that the mortality rates for each of the countries have consistently decreased except for Niger which showed an increase around 1985 - 1990 but subsequently rates decreased for Niger also.

#The data set 3 relates to “Estimates for under-five, infant and neonatal mortality”. The dataset is in wide form and contains six value variables of interest: under-five (0-4 years) mortality, infant (0-1 years) mortality, neonatal (0-1 month) mortality, as well as the number of under-five, infant, and neonatal deaths.

mort_indicators<-read_csv("https://raw.githubusercontent.com/chitrarth2018/607-Project-2/master/RatesDeaths_AllIndicators.csv", col_names = TRUE)

## Parsed with column specification:
## cols(
##   .default = col_double(),
##   `ISO Code` = col_character(),
##   CountryName = col_character(),
##   Uncertaintybounds = col_character(),
##   `NMR-1950` = col_logical(),
##   `Neonatal-Deaths-1950` = col_logical()
## )

## See spec(...) for full column specifications.

head(mort_indicators)

## # A tibble: 6 x 399
##   `ISO Code` CountryName Uncertaintyboun~ `U5MR-1950` `U5MR-1951` `U5MR-1952`
##   <chr>      <chr>       <chr>                  <dbl>       <dbl>       <dbl>
## 1 AFG        Afghanistan Lower                     NA          NA          NA
## 2 AFG        Afghanistan Median                    NA          NA          NA
## 3 AFG        Afghanistan Upper                     NA          NA          NA
## 4 AGO        Angola      Lower                     NA          NA          NA
## 5 AGO        Angola      Median                    NA          NA          NA
## 6 AGO        Angola      Upper                     NA          NA          NA
## # ... with 393 more variables: `U5MR-1953` <dbl>, `U5MR-1954` <dbl>,
## #   `U5MR-1955` <dbl>, `U5MR-1956` <dbl>, `U5MR-1957` <dbl>, `U5MR-1958` <dbl>,
## #   `U5MR-1959` <dbl>, `U5MR-1960` <dbl>, `U5MR-1961` <dbl>, `U5MR-1962` <dbl>,
## #   `U5MR-1963` <dbl>, `U5MR-1964` <dbl>, `U5MR-1965` <dbl>, `U5MR-1966` <dbl>,
## #   `U5MR-1967` <dbl>, `U5MR-1968` <dbl>, `U5MR-1969` <dbl>, `U5MR-1970` <dbl>,
## #   `U5MR-1971` <dbl>, `U5MR-1972` <dbl>, `U5MR-1973` <dbl>, `U5MR-1974` <dbl>,
## #   `U5MR-1975` <dbl>, `U5MR-1976` <dbl>, `U5MR-1977` <dbl>, `U5MR-1978` <dbl>,
## #   `U5MR-1979` <dbl>, `U5MR-1980` <dbl>, `U5MR-1981` <dbl>, `U5MR-1982` <dbl>,
## #   `U5MR-1983` <dbl>, `U5MR-1984` <dbl>, `U5MR-1985` <dbl>, `U5MR-1986` <dbl>,
## #   `U5MR-1987` <dbl>, `U5MR-1988` <dbl>, `U5MR-1989` <dbl>, `U5MR-1990` <dbl>,
## #   `U5MR-1991` <dbl>, `U5MR-1992` <dbl>, `U5MR-1993` <dbl>, `U5MR-1994` <dbl>,
## #   `U5MR-1995` <dbl>, `U5MR-1996` <dbl>, `U5MR-1997` <dbl>, `U5MR-1998` <dbl>,
## #   `U5MR-1999` <dbl>, `U5MR-2000` <dbl>, `U5MR-2001` <dbl>, `U5MR-2002` <dbl>,
## #   `U5MR-2003` <dbl>, `U5MR-2004` <dbl>, `U5MR-2005` <dbl>, `U5MR-2006` <dbl>,
## #   `U5MR-2007` <dbl>, `U5MR-2008` <dbl>, `U5MR-2009` <dbl>, `U5MR-2010` <dbl>,
## #   `U5MR-2011` <dbl>, `U5MR-2012` <dbl>, `U5MR-2013` <dbl>, `U5MR-2014` <dbl>,
## #   `U5MR-2015` <dbl>, `IMR-1950` <dbl>, `IMR-1951` <dbl>, `IMR-1952` <dbl>,
## #   `IMR-1953` <dbl>, `IMR-1954` <dbl>, `IMR-1955` <dbl>, `IMR-1956` <dbl>,
## #   `IMR-1957` <dbl>, `IMR-1958` <dbl>, `IMR-1959` <dbl>, `IMR-1960` <dbl>,
## #   `IMR-1961` <dbl>, `IMR-1962` <dbl>, `IMR-1963` <dbl>, `IMR-1964` <dbl>,
## #   `IMR-1965` <dbl>, `IMR-1966` <dbl>, `IMR-1967` <dbl>, `IMR-1968` <dbl>,
## #   `IMR-1969` <dbl>, `IMR-1970` <dbl>, `IMR-1971` <dbl>, `IMR-1972` <dbl>,
## #   `IMR-1973` <dbl>, `IMR-1974` <dbl>, `IMR-1975` <dbl>, `IMR-1976` <dbl>,
## #   `IMR-1977` <dbl>, `IMR-1978` <dbl>, `IMR-1979` <dbl>, `IMR-1980` <dbl>,
## #   `IMR-1981` <dbl>, `IMR-1982` <dbl>, `IMR-1983` <dbl>, `IMR-1984` <dbl>,
## #   `IMR-1985` <dbl>, `IMR-1986` <dbl>, ...

#filtering the median values for all the countries
mort_indicators_v1<-mort_indicators%>%filter(Uncertaintybounds %in% c("Median"))
head(mort_indicators_v1)

## # A tibble: 6 x 399
##   `ISO Code` CountryName Uncertaintyboun~ `U5MR-1950` `U5MR-1951` `U5MR-1952`
##   <chr>      <chr>       <chr>                  <dbl>       <dbl>       <dbl>
## 1 AFG        Afghanistan Median                    NA          NA          NA
## 2 AGO        Angola      Median                    NA          NA          NA
## 3 ALB        Albania     Median                    NA          NA          NA
## 4 AND        Andorra     Median                    NA          NA          NA
## 5 ARE        United Ara~ Median                    NA          NA          NA
## 6 ARG        Argentina   Median                    NA          NA          NA
## # ... with 393 more variables: `U5MR-1953` <dbl>, `U5MR-1954` <dbl>,
## #   `U5MR-1955` <dbl>, `U5MR-1956` <dbl>, `U5MR-1957` <dbl>, `U5MR-1958` <dbl>,
## #   `U5MR-1959` <dbl>, `U5MR-1960` <dbl>, `U5MR-1961` <dbl>, `U5MR-1962` <dbl>,
## #   `U5MR-1963` <dbl>, `U5MR-1964` <dbl>, `U5MR-1965` <dbl>, `U5MR-1966` <dbl>,
## #   `U5MR-1967` <dbl>, `U5MR-1968` <dbl>, `U5MR-1969` <dbl>, `U5MR-1970` <dbl>,
## #   `U5MR-1971` <dbl>, `U5MR-1972` <dbl>, `U5MR-1973` <dbl>, `U5MR-1974` <dbl>,
## #   `U5MR-1975` <dbl>, `U5MR-1976` <dbl>, `U5MR-1977` <dbl>, `U5MR-1978` <dbl>,
## #   `U5MR-1979` <dbl>, `U5MR-1980` <dbl>, `U5MR-1981` <dbl>, `U5MR-1982` <dbl>,
## #   `U5MR-1983` <dbl>, `U5MR-1984` <dbl>, `U5MR-1985` <dbl>, `U5MR-1986` <dbl>,
## #   `U5MR-1987` <dbl>, `U5MR-1988` <dbl>, `U5MR-1989` <dbl>, `U5MR-1990` <dbl>,
## #   `U5MR-1991` <dbl>, `U5MR-1992` <dbl>, `U5MR-1993` <dbl>, `U5MR-1994` <dbl>,
## #   `U5MR-1995` <dbl>, `U5MR-1996` <dbl>, `U5MR-1997` <dbl>, `U5MR-1998` <dbl>,
## #   `U5MR-1999` <dbl>, `U5MR-2000` <dbl>, `U5MR-2001` <dbl>, `U5MR-2002` <dbl>,
## #   `U5MR-2003` <dbl>, `U5MR-2004` <dbl>, `U5MR-2005` <dbl>, `U5MR-2006` <dbl>,
## #   `U5MR-2007` <dbl>, `U5MR-2008` <dbl>, `U5MR-2009` <dbl>, `U5MR-2010` <dbl>,
## #   `U5MR-2011` <dbl>, `U5MR-2012` <dbl>, `U5MR-2013` <dbl>, `U5MR-2014` <dbl>,
## #   `U5MR-2015` <dbl>, `IMR-1950` <dbl>, `IMR-1951` <dbl>, `IMR-1952` <dbl>,
## #   `IMR-1953` <dbl>, `IMR-1954` <dbl>, `IMR-1955` <dbl>, `IMR-1956` <dbl>,
## #   `IMR-1957` <dbl>, `IMR-1958` <dbl>, `IMR-1959` <dbl>, `IMR-1960` <dbl>,
## #   `IMR-1961` <dbl>, `IMR-1962` <dbl>, `IMR-1963` <dbl>, `IMR-1964` <dbl>,
## #   `IMR-1965` <dbl>, `IMR-1966` <dbl>, `IMR-1967` <dbl>, `IMR-1968` <dbl>,
## #   `IMR-1969` <dbl>, `IMR-1970` <dbl>, `IMR-1971` <dbl>, `IMR-1972` <dbl>,
## #   `IMR-1973` <dbl>, `IMR-1974` <dbl>, `IMR-1975` <dbl>, `IMR-1976` <dbl>,
## #   `IMR-1977` <dbl>, `IMR-1978` <dbl>, `IMR-1979` <dbl>, `IMR-1980` <dbl>,
## #   `IMR-1981` <dbl>, `IMR-1982` <dbl>, `IMR-1983` <dbl>, `IMR-1984` <dbl>,
## #   `IMR-1985` <dbl>, `IMR-1986` <dbl>, ...

#converting data into long form and then creating type variable that signifies the different types of deaths
mort_indicators_v2<-mort_indicators_v1%>%gather(dummy,value,4:399)%>%mutate(type = str_extract(dummy,"U5MR|IMR|NMR|Infant-Deaths|Neonatal-Deaths|Under-five-Deaths"))%>%mutate(year=str_extract(dummy,"\\d*$"))%>%select(-1,-3,-4)%>%arrange(CountryName)

head(mort_indicators_v2)

## # A tibble: 6 x 4
##   CountryName value type  year 
##   <chr>       <dbl> <chr> <chr>
## 1 Afghanistan    NA U5MR  1950 
## 2 Afghanistan    NA U5MR  1951 
## 3 Afghanistan    NA U5MR  1952 
## 4 Afghanistan    NA U5MR  1953 
## 5 Afghanistan    NA U5MR  1954 
## 6 Afghanistan    NA U5MR  1955

#analysis has to be done to compute the under-five, infant, and neonatal deaths that occurred in the world in years 1990, 1995, 2000, 2005, and 2015

mort_summary<-mort_indicators_v2%>%group_by(year,type)%>%filter(year %in% c("1990", "1995", "2000", "2005", "2015"))%>%filter(type %in% c("U5MR","Infant-Deaths","Neonatal-Deaths"))%>%summarize(total=sum(value, na.rm=TRUE))
view(mort_summary)

DATA 607 - Project 2

Chitrarth Kaushik

3/7/2020