RpubLink

Package used

  1. library(tidyverse)
  2. library(lubridate)
  3. library(stringr)
  4. library(spelling)
  5. library(DT)
  6. spell_check_files(“607_Week6_P2.rmd”, ignore = character(), lang = “en_US”)

About Data

Data is from US disability, it list number of applications being submitted from online and offline . Objective is to find that if Online service has helped Gov. to get more application through online mode. * Data has lots of problem + Data is wide by Month, AND also Fiscal Year may Not equal to same year.

Data Process

Read Data

# Read in data
ssa <- read_csv("https://raw.githubusercontent.com/Rajwantmishra/msds/master/week5/ssadisability.csv")
## Parsed with column specification:
## cols(
##   .default = col_number(),
##   Fiscal_Year = col_character()
## )
## See spec(...) for full column specifications.
# Take a look at how this was imported
glimpse(ssa)
## Observations: 10
## Variables: 25
## $ Fiscal_Year     <chr> "FY08", "FY09", "FY10", "FY11", "FY12", "FY13"...
## $ Oct_Total       <dbl> 176407, 244781, 286598, 299033, 227456, 224624...
## $ Oct_Internet    <dbl> 15082, 32578, 65533, 92856, 86811, 92542, 9840...
## $ Nov_Total       <dbl> 204287, 181161, 213297, 209553, 200140, 249910...
## $ Nov_Internet    <dbl> 17301, 25620, 50098, 63424, 71175, 107053, 117...
## $ Dec_Total       <dbl> 151687, 176107, 198733, 215239, 254766, 188183...
## $ Dec_Internet    <dbl> 14321, 27174, 44512, 62877, 91424, 79719, 8337...
## $ Jan_Total       <dbl> 162966, 249062, 265665, 264286, 221146, 199588...
## $ Jan_Internet    <dbl> 18391, 57908, 68843, 84944, 85848, 93703, 1253...
## $ Feb_Total       <dbl> 228623, 221368, 225319, 223625, 228519, 219604...
## $ Feb_Internet    <dbl> 26034, 50408, 58465, 71314, 83576, 101878, 108...
## $ Mar_Total       <dbl> 190716, 235360, 243266, 246630, 299267, 285923...
## $ Mar_Internet    <dbl> 21064, 53592, 62198, 77916, 112104, 129415, 11...
## $ Apr_Total       <dbl> 194403, 234304, 298065, 300359, 233685, 224804...
## $ Apr_Internet    <dbl> 22372, 53675, 76573, 94722, 88330, 101619, 112...
## $ May_Total       <dbl> 226549, 281343, 239409, 241673, 239503, 269955...
## $ May_Internet    <dbl> 26337, 65822, 65780, 77603, 93826, 123440, 134...
## $ June_Total      <dbl> 193094, 237329, 231964, 233351, 284136, 223238...
## $ June_Internet   <dbl> 22551, 54285, 67163, 79925, 113613, 104146, 11...
## $ July_Total      <dbl> 181552, 285172, 300442, 292949, 221745, 204072...
## $ July_Internet   <dbl> 22728, 66565, 92957, 105276, 91323, 98326, 106...
## $ August_Total    <dbl> 245429, 240611, 248284, 237555, 298458, 281828...
## $ August_Internet <dbl> 30580, 54915, 75535, 86514, 119795, 135423, 13...
## $ Sept_Internet   <dbl> 24141, 52687, 73403, 103564, 93375, 104270, 10...
## $ Sept_Total      <dbl> 186750, 228692, 238965, 280913, 230648, 214004...
# tbl_df(ssa)
datatable(ssa)

Data Wrangling

  • Make Data long
  • Split the COlumn data into multiple column
  • Convert Months to 3 Char letter months abbreviations
  • Convert Fiscal Year to valid year
  • Building Date by Months and Year and Date
  • Converting it into right date fromat

Working with Date

# Make the dataset long by skipping Fiscal Year column
ssa_long <- gather(ssa, month, applications, -Fiscal_Year)

# Long  data 
datatable(ssa_long)
# Split the Month into  month and application type by "_" to split months and Type of application .(internet and Total )
ssa_long <- separate(ssa_long, month, c("month", "application_method"), sep="_")

# Data after split 
print(ssa_long, n=20)
## # A tibble: 240 x 4
##    Fiscal_Year month application_method applications
##    <chr>       <chr> <chr>                     <dbl>
##  1 FY08        Oct   Total                    176407
##  2 FY09        Oct   Total                    244781
##  3 FY10        Oct   Total                    286598
##  4 FY11        Oct   Total                    299033
##  5 FY12        Oct   Total                    227456
##  6 FY13        Oct   Total                    224624
##  7 FY14        Oct   Total                    206471
##  8 FY15        Oct   Total                    254294
##  9 FY16        Oct   Total                    244599
## 10 FY17        Oct   Total                    173396
## 11 FY08        Oct   Internet                  15082
## 12 FY09        Oct   Internet                  32578
## 13 FY10        Oct   Internet                  65533
## 14 FY11        Oct   Internet                  92856
## 15 FY12        Oct   Internet                  86811
## 16 FY13        Oct   Internet                  92542
## 17 FY14        Oct   Internet                  98400
## 18 FY15        Oct   Internet                 133740
## 19 FY16        Oct   Internet                 125971
## 20 FY17        Oct   Internet                  90325
## # ... with 220 more rows
# Checking Unique Months value , and we note that not all the Months have 3 char length 
unique(ssa_long$month)
##  [1] "Oct"    "Nov"    "Dec"    "Jan"    "Feb"    "Mar"    "Apr"   
##  [8] "May"    "June"   "July"   "August" "Sept"
# Convert month to standard abbreviations , as not all the months are in 3 Char string length. 
ssa_long$month <- str_sub(ssa_long$month,1,3)

# Check value for months 
unique(ssa_long$month)
##  [1] "Oct" "Nov" "Dec" "Jan" "Feb" "Mar" "Apr" "May" "Jun" "Jul" "Aug"
## [12] "Sep"
# Check Value for Fiscal Year
unique(ssa_long$Fiscal_Year)
##  [1] "FY08" "FY09" "FY10" "FY11" "FY12" "FY13" "FY14" "FY15" "FY16" "FY17"
# Convert Fiscal_Year from alphanumeric strings to actual years
# Replce FY with 20
ssa_long$Fiscal_Year <- str_replace(ssa_long$Fiscal_Year, "FY", "20")

# What values do we now have for months?
unique(ssa_long$Fiscal_Year)
##  [1] "2008" "2009" "2010" "2011" "2012" "2013" "2014" "2015" "2016" "2017"
# Build a date string using the first day of the month
paste('01', ssa_long$month, ssa_long$Fiscal_Year)
##   [1] "01 Oct 2008" "01 Oct 2009" "01 Oct 2010" "01 Oct 2011" "01 Oct 2012"
##   [6] "01 Oct 2013" "01 Oct 2014" "01 Oct 2015" "01 Oct 2016" "01 Oct 2017"
##  [11] "01 Oct 2008" "01 Oct 2009" "01 Oct 2010" "01 Oct 2011" "01 Oct 2012"
##  [16] "01 Oct 2013" "01 Oct 2014" "01 Oct 2015" "01 Oct 2016" "01 Oct 2017"
##  [21] "01 Nov 2008" "01 Nov 2009" "01 Nov 2010" "01 Nov 2011" "01 Nov 2012"
##  [26] "01 Nov 2013" "01 Nov 2014" "01 Nov 2015" "01 Nov 2016" "01 Nov 2017"
##  [31] "01 Nov 2008" "01 Nov 2009" "01 Nov 2010" "01 Nov 2011" "01 Nov 2012"
##  [36] "01 Nov 2013" "01 Nov 2014" "01 Nov 2015" "01 Nov 2016" "01 Nov 2017"
##  [41] "01 Dec 2008" "01 Dec 2009" "01 Dec 2010" "01 Dec 2011" "01 Dec 2012"
##  [46] "01 Dec 2013" "01 Dec 2014" "01 Dec 2015" "01 Dec 2016" "01 Dec 2017"
##  [51] "01 Dec 2008" "01 Dec 2009" "01 Dec 2010" "01 Dec 2011" "01 Dec 2012"
##  [56] "01 Dec 2013" "01 Dec 2014" "01 Dec 2015" "01 Dec 2016" "01 Dec 2017"
##  [61] "01 Jan 2008" "01 Jan 2009" "01 Jan 2010" "01 Jan 2011" "01 Jan 2012"
##  [66] "01 Jan 2013" "01 Jan 2014" "01 Jan 2015" "01 Jan 2016" "01 Jan 2017"
##  [71] "01 Jan 2008" "01 Jan 2009" "01 Jan 2010" "01 Jan 2011" "01 Jan 2012"
##  [76] "01 Jan 2013" "01 Jan 2014" "01 Jan 2015" "01 Jan 2016" "01 Jan 2017"
##  [81] "01 Feb 2008" "01 Feb 2009" "01 Feb 2010" "01 Feb 2011" "01 Feb 2012"
##  [86] "01 Feb 2013" "01 Feb 2014" "01 Feb 2015" "01 Feb 2016" "01 Feb 2017"
##  [91] "01 Feb 2008" "01 Feb 2009" "01 Feb 2010" "01 Feb 2011" "01 Feb 2012"
##  [96] "01 Feb 2013" "01 Feb 2014" "01 Feb 2015" "01 Feb 2016" "01 Feb 2017"
## [101] "01 Mar 2008" "01 Mar 2009" "01 Mar 2010" "01 Mar 2011" "01 Mar 2012"
## [106] "01 Mar 2013" "01 Mar 2014" "01 Mar 2015" "01 Mar 2016" "01 Mar 2017"
## [111] "01 Mar 2008" "01 Mar 2009" "01 Mar 2010" "01 Mar 2011" "01 Mar 2012"
## [116] "01 Mar 2013" "01 Mar 2014" "01 Mar 2015" "01 Mar 2016" "01 Mar 2017"
## [121] "01 Apr 2008" "01 Apr 2009" "01 Apr 2010" "01 Apr 2011" "01 Apr 2012"
## [126] "01 Apr 2013" "01 Apr 2014" "01 Apr 2015" "01 Apr 2016" "01 Apr 2017"
## [131] "01 Apr 2008" "01 Apr 2009" "01 Apr 2010" "01 Apr 2011" "01 Apr 2012"
## [136] "01 Apr 2013" "01 Apr 2014" "01 Apr 2015" "01 Apr 2016" "01 Apr 2017"
## [141] "01 May 2008" "01 May 2009" "01 May 2010" "01 May 2011" "01 May 2012"
## [146] "01 May 2013" "01 May 2014" "01 May 2015" "01 May 2016" "01 May 2017"
## [151] "01 May 2008" "01 May 2009" "01 May 2010" "01 May 2011" "01 May 2012"
## [156] "01 May 2013" "01 May 2014" "01 May 2015" "01 May 2016" "01 May 2017"
## [161] "01 Jun 2008" "01 Jun 2009" "01 Jun 2010" "01 Jun 2011" "01 Jun 2012"
## [166] "01 Jun 2013" "01 Jun 2014" "01 Jun 2015" "01 Jun 2016" "01 Jun 2017"
## [171] "01 Jun 2008" "01 Jun 2009" "01 Jun 2010" "01 Jun 2011" "01 Jun 2012"
## [176] "01 Jun 2013" "01 Jun 2014" "01 Jun 2015" "01 Jun 2016" "01 Jun 2017"
## [181] "01 Jul 2008" "01 Jul 2009" "01 Jul 2010" "01 Jul 2011" "01 Jul 2012"
## [186] "01 Jul 2013" "01 Jul 2014" "01 Jul 2015" "01 Jul 2016" "01 Jul 2017"
## [191] "01 Jul 2008" "01 Jul 2009" "01 Jul 2010" "01 Jul 2011" "01 Jul 2012"
## [196] "01 Jul 2013" "01 Jul 2014" "01 Jul 2015" "01 Jul 2016" "01 Jul 2017"
## [201] "01 Aug 2008" "01 Aug 2009" "01 Aug 2010" "01 Aug 2011" "01 Aug 2012"
## [206] "01 Aug 2013" "01 Aug 2014" "01 Aug 2015" "01 Aug 2016" "01 Aug 2017"
## [211] "01 Aug 2008" "01 Aug 2009" "01 Aug 2010" "01 Aug 2011" "01 Aug 2012"
## [216] "01 Aug 2013" "01 Aug 2014" "01 Aug 2015" "01 Aug 2016" "01 Aug 2017"
## [221] "01 Sep 2008" "01 Sep 2009" "01 Sep 2010" "01 Sep 2011" "01 Sep 2012"
## [226] "01 Sep 2013" "01 Sep 2014" "01 Sep 2015" "01 Sep 2016" "01 Sep 2017"
## [231] "01 Sep 2008" "01 Sep 2009" "01 Sep 2010" "01 Sep 2011" "01 Sep 2012"
## [236] "01 Sep 2013" "01 Sep 2014" "01 Sep 2015" "01 Sep 2016" "01 Sep 2017"
# Convert date to right date format. 
ssa_long$Date <- dmy(paste("01", ssa_long$month, ssa$Fiscal_Year))

# Check final date after applying DMY 
unique(ssa_long$Date)
##   [1] "2008-10-01" "2009-10-01" "2010-10-01" "2011-10-01" "2012-10-01"
##   [6] "2013-10-01" "2014-10-01" "2015-10-01" "2016-10-01" "2017-10-01"
##  [11] "2008-11-01" "2009-11-01" "2010-11-01" "2011-11-01" "2012-11-01"
##  [16] "2013-11-01" "2014-11-01" "2015-11-01" "2016-11-01" "2017-11-01"
##  [21] "2008-12-01" "2009-12-01" "2010-12-01" "2011-12-01" "2012-12-01"
##  [26] "2013-12-01" "2014-12-01" "2015-12-01" "2016-12-01" "2017-12-01"
##  [31] "2008-01-01" "2009-01-01" "2010-01-01" "2011-01-01" "2012-01-01"
##  [36] "2013-01-01" "2014-01-01" "2015-01-01" "2016-01-01" "2017-01-01"
##  [41] "2008-02-01" "2009-02-01" "2010-02-01" "2011-02-01" "2012-02-01"
##  [46] "2013-02-01" "2014-02-01" "2015-02-01" "2016-02-01" "2017-02-01"
##  [51] "2008-03-01" "2009-03-01" "2010-03-01" "2011-03-01" "2012-03-01"
##  [56] "2013-03-01" "2014-03-01" "2015-03-01" "2016-03-01" "2017-03-01"
##  [61] "2008-04-01" "2009-04-01" "2010-04-01" "2011-04-01" "2012-04-01"
##  [66] "2013-04-01" "2014-04-01" "2015-04-01" "2016-04-01" "2017-04-01"
##  [71] "2008-05-01" "2009-05-01" "2010-05-01" "2011-05-01" "2012-05-01"
##  [76] "2013-05-01" "2014-05-01" "2015-05-01" "2016-05-01" "2017-05-01"
##  [81] "2008-06-01" "2009-06-01" "2010-06-01" "2011-06-01" "2012-06-01"
##  [86] "2013-06-01" "2014-06-01" "2015-06-01" "2016-06-01" "2017-06-01"
##  [91] "2008-07-01" "2009-07-01" "2010-07-01" "2011-07-01" "2012-07-01"
##  [96] "2013-07-01" "2014-07-01" "2015-07-01" "2016-07-01" "2017-07-01"
## [101] "2008-08-01" "2009-08-01" "2010-08-01" "2011-08-01" "2012-08-01"
## [106] "2013-08-01" "2014-08-01" "2015-08-01" "2016-08-01" "2017-08-01"
## [111] "2008-09-01" "2009-09-01" "2010-09-01" "2011-09-01" "2012-09-01"
## [116] "2013-09-01" "2014-09-01" "2015-09-01" "2016-09-01" "2017-09-01"

Working with Fiscal Yr

Government fiscal years differ from calendar years in that they are named for the calendar year where they end. The government fiscal year begins in October. So October 2016 is actually in FY17. We need to convert these to calendar dates before we try to plot them, so we need to find months >=10 and subtract one year from them

# Government fiscal years differ from calendar years in that they are named for the
# calendar year where they end.  The government fiscal year begins in October.
# So October 2016 is actually in FY17.
#
# We need to convert these to calendar dates before we try to plot them, so we 
# need to find months >=10 and subtract one year from them

# Let's find the affected rows
advanced_dates <- which(month(ssa_long$Date)>=10)

# And then decerement the years by one
year(ssa_long$Date[advanced_dates]) <- year(ssa_long$Date[advanced_dates]) - 1

datatable(ssa_long)
# Let's look at where we are
summary(ssa_long)
##  Fiscal_Year           month           application_method  applications   
##  Length:240         Length:240         Length:240         Min.   : 14321  
##  Class :character   Class :character   Class :character   1st Qu.: 91399  
##  Mode  :character   Mode  :character   Mode  :character   Median :145344  
##                                                           Mean   :154322  
##                                                           3rd Qu.:224669  
##                                                           Max.   :300442  
##                                                           NA's   :16      
##       Date           
##  Min.   :2007-10-01  
##  1st Qu.:2010-03-24  
##  Median :2012-09-16  
##  Mean   :2012-09-15  
##  3rd Qu.:2015-03-08  
##  Max.   :2017-09-01  
## 

Analysis

# Remove Fiscal_Year and month columns
ssa_long$Fiscal_Year <- NULL
ssa_long$month <- NULL

# Convert application_method to a factor
ssa_long$application_method <- as.factor(ssa_long$application_method)

# Checking the data summary now.
summary(ssa_long)
##  application_method  applications         Date           
##  Internet:120       Min.   : 14321   Min.   :2007-10-01  
##  Total   :120       1st Qu.: 91399   1st Qu.:2010-03-24  
##                     Median :145344   Median :2012-09-16  
##                     Mean   :154322   Mean   :2012-09-15  
##                     3rd Qu.:224669   3rd Qu.:2015-03-08  
##                     Max.   :300442   Max.   :2017-09-01  
##                     NA's   :16
# Widen the final dataset, moving Appliation Method side by side in a row  for the moth.
ssa <- spread(ssa_long, application_method, applications)

# Data view after spread
datatable(ssa)
# Add a column for the percentage of applications that were online
ssa$online_percentage <- ssa$Internet/ssa$Total*100
ssa$Offline_percentage <- 100 - ssa$online_percentage
ssa <- ssa[which(!is.na(ssa$Total)),]

# Plot the results
ggplot(data=ssa, mapping=aes(x=Date,y=online_percentage)) +
  geom_point()+
  geom_line()+
labs(title = "Type of Applicaiton Flow by Year")

plot_vline <- filter(ssa,ssa$online_percentage> ssa$Offline_percentage)$Date[1]
## Warning: package 'bindrcpp' was built under R version 3.5.2
ggplot(data=ssa, mapping=aes(x=Date)) +
  geom_line(aes(y=Offline_percentage, colour="darkblue"),size=1 )+
  geom_line(aes(y=online_percentage, colour="darkred"),size=1 )+
scale_color_discrete(name = "Type ", labels = c("Offline_percentage","online_percentage"))+
    labs(title = "Type of Applicaiton Flow by Year")+
  geom_vline(xintercept = plot_vline,color="blue")+
  geom_text( mapping=aes(x=plot_vline+1, y=55, label=" Internet  > Offline "), size=4, angle=90, vjust=-0.4, hjust=0) 

Conclusion

Based on the above data points and progress of the Internet application over year , we can say that Internet applications are growing over the year and its helping Gov to reduce the traffic of the offline request.