Clearing the memory

rm(list=ls())

Loading the Data

raw.data<- read.csv("C:\\Users\\nidhi\\OneDrive\\Desktop\\Forecasting Project3\\ks-projects-201801.csv")
dim(raw.data)
## [1] 378661     15
names(raw.data)
##  [1] "ID"               "name"             "category"        
##  [4] "main_category"    "currency"         "deadline"        
##  [7] "goal"             "launched"         "pledged"         
## [10] "state"            "backers"          "country"         
## [13] "usd.pledged"      "usd_pledged_real" "usd_goal_real"
head(raw.data)
##           ID                                                       name
## 1 1000002330                            The Songs of Adelaide & Abullah
## 2 1000003930              Greeting From Earth: ZGAC Arts Capsule For ET
## 3 1000004038                                             Where is Hank?
## 4 1000007540          ToshiCapital Rekordz Needs Help to Complete Album
## 5 1000011046 Community Film Project: The Art of Neighborhood Filmmaking
## 6 1000014025                                       Monarch Espresso Bar
##         category main_category currency   deadline  goal
## 1         Poetry    Publishing      GBP 2015-10-09  1000
## 2 Narrative Film  Film & Video      USD 2017-11-01 30000
## 3 Narrative Film  Film & Video      USD 2013-02-26 45000
## 4          Music         Music      USD 2012-04-16  5000
## 5   Film & Video  Film & Video      USD 2015-08-29 19500
## 6    Restaurants          Food      USD 2016-04-01 50000
##              launched pledged      state backers country usd.pledged
## 1 2015-08-11 12:12:28       0     failed       0      GB           0
## 2 2017-09-02 04:43:57    2421     failed      15      US         100
## 3 2013-01-12 00:20:50     220     failed       3      US         220
## 4 2012-03-17 03:24:11       1     failed       1      US           1
## 5 2015-07-04 08:35:03    1283   canceled      14      US        1283
## 6 2016-02-26 13:38:27   52375 successful     224      US       52375
##   usd_pledged_real usd_goal_real
## 1                0       1533.95
## 2             2421      30000.00
## 3              220      45000.00
## 4                1       5000.00
## 5             1283      19500.00
## 6            52375      50000.00
str(raw.data)
## 'data.frame':    378661 obs. of  15 variables:
##  $ ID              : int  1000002330 1000003930 1000004038 1000007540 1000011046 1000014025 1000023410 1000030581 1000034518 100004195 ...
##  $ name            : Factor w/ 375765 levels "","\177Not Twins - New EP! \"The View from Down Here\"",..: 332541 135689 365010 344805 77349 206130 293462 69360 284139 290718 ...
##  $ category        : Factor w/ 159 levels "3D Printing",..: 109 94 94 91 56 124 59 42 114 40 ...
##  $ main_category   : Factor w/ 15 levels "Art","Comics",..: 13 7 7 11 7 8 8 8 5 7 ...
##  $ currency        : Factor w/ 14 levels "AUD","CAD","CHF",..: 6 14 14 14 14 14 14 14 14 14 ...
##  $ deadline        : Factor w/ 3164 levels "2009-05-03","2009-05-16",..: 2288 3042 1333 1017 2247 2463 1996 2448 1790 1863 ...
##  $ goal            : num  1000 30000 45000 5000 19500 50000 1000 25000 125000 65000 ...
##  $ launched        : Factor w/ 378089 levels "1970-01-01 01:00:00",..: 243292 361975 80409 46557 235943 278600 187500 274014 139367 153766 ...
##  $ pledged         : num  0 2421 220 1 1283 ...
##  $ state           : Factor w/ 6 levels "canceled","failed",..: 2 2 2 2 1 4 4 2 1 1 ...
##  $ backers         : int  0 15 3 1 14 224 16 40 58 43 ...
##  $ country         : Factor w/ 23 levels "AT","AU","BE",..: 10 23 23 23 23 23 23 23 23 23 ...
##  $ usd.pledged     : num  0 100 220 1 1283 ...
##  $ usd_pledged_real: num  0 2421 220 1 1283 ...
##  $ usd_goal_real   : num  1534 30000 45000 5000 19500 ...
#Libaries Required

library(lubridate)
## Warning: package 'lubridate' was built under R version 3.5.1
## 
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
## 
##     date
library(fpp2)
## Warning: package 'fpp2' was built under R version 3.5.3
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 3.5.3
## Loading required package: forecast
## Warning: package 'forecast' was built under R version 3.5.2
## Loading required package: fma
## Warning: package 'fma' was built under R version 3.5.2
## Loading required package: expsmooth
## Warning: package 'expsmooth' was built under R version 3.5.2
library(gridExtra)
## Warning: package 'gridExtra' was built under R version 3.5.3
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.5.1
## 
## Attaching package: 'dplyr'
## The following object is masked from 'package:gridExtra':
## 
##     combine
## The following objects are masked from 'package:lubridate':
## 
##     intersect, setdiff, union
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
#Data Cleaning
raw.data$ID <- as.character(raw.data$ID)
raw.data$name <- as.character(raw.data$name)
raw.data$deadline <- as.Date(raw.data$deadline)
raw.data$launched <- as.Date(raw.data$launched)


prcd.data<- raw.data %>%
  mutate(deadline_year = year(deadline), 
                deadline_month = month(deadline), 
                deadline_day = day(deadline),
         deadline_YrMth=format(as.Date(raw.data$deadline), "%Y%m") )
## Warning: package 'bindrcpp' was built under R version 3.5.1
prcd.data<- prcd.data %>%
  mutate(launched_year = year(launched), 
                launched_month = month(launched), 
                launched_day = day(launched),
         launched_YrMth=format(as.Date(raw.data$launched), "%Y%m"))

#Focus on Failed and Successful

Fnl.data <- prcd.data %>% filter(state == "failed" | state == "successful") 
Fnl.data$state <- as.character(Fnl.data$state)
Fnl.data$state <- as.factor(Fnl.data$state)
summary(Fnl.data$state)
##     failed successful 
##     197719     133956
#Plot the number of projects in each states of the projects
ggplot(prcd.data, aes(state)) +
  geom_bar(fill = "#FF6666") +
  ylab("# of Projects") + xlab("Final State") +
  ggtitle("Final State of the Kickstarter projects")

#State of projects in each year based on deadline 
ggplot(prcd.data, aes(x=deadline_year,fill=factor(state))) +
  geom_bar() +
  #theme(axis.text.x = element_text(angle = 45, vjust = 0.5)) +
  #scale_x_discrete(breaks=prcd.data$deadline_year)+ 
  ylab("No of Projects") + xlab("Year") +
  ggtitle("State of the Kickstarter projects")

#Failed and Successful projects in each year based on deadline 
State.by.year.smry <- as.data.frame(table(Fnl.data$deadline_year, Fnl.data$state))
names(State.by.year.smry) <- c('deadline_Year','State', 'Freq')
State.by.year.smry <- State.by.year.smry %>% group_by(deadline_Year) %>% arrange(desc(Freq))

ggplot(State.by.year.smry, aes(x = deadline_Year, y = `Freq`, fill = State)) +
  geom_bar(stat = "identity", position = "dodge") +
  ggtitle("Failed & Successful Projects deadline over time") +
  scale_y_continuous("# of Projects deadline", labels = scales::comma) +
  scale_x_discrete("Year")  

# Plot Total Projects by all the different State

state.by.tot <- as.data.frame.matrix((as.matrix(table(raw.data$state))))
rw.names.state.by.tot<- row.names(state.by.tot)
state.by.tot <-cbind(rw.names.state.by.tot,state.by.tot)
row.names(state.by.tot) <- NULL
names(state.by.tot)<- c("State","Value")

ggplot(state.by.tot, aes('', Value, fill = State)) + 
  geom_col(position = 'fill') +
  geom_label(aes(label = paste(format(round( (Value/sum(Value))*100, 2), nsmall = 2),"%")), position = position_fill(vjust = 0.6)) +
  coord_polar(theta = 'y')+
  labs(x = NULL, y = NULL, fill = "State", title = "Total Projects by Failed/Successful State")

state.by.tot <- as.data.frame.matrix((as.matrix(table(Fnl.data$state))))
rw.names.state.by.tot<- row.names(state.by.tot)
state.by.tot <-cbind(rw.names.state.by.tot,state.by.tot)
row.names(state.by.tot) <- NULL
names(state.by.tot)<- c("State","Value")

# Plot Total Projects by Failed /Successful State
ggplot(state.by.tot, aes('', Value, fill = State)) + 
  geom_col(position = 'fill') +
  geom_label(aes(label = paste(format(round( (Value/sum(Value))*100, 2), nsmall = 2),"%")), position = position_fill(vjust = 0.6)) +
  coord_polar(theta = 'y')+
  labs(x = NULL, y = NULL, fill = "State", title = "Total Projects by Failed/Successful State")

# Identify the success project,failed projects,successrate,failedrate at year level
state_by_year <- as.data.frame.matrix( t(as.matrix(table(Fnl.data$state,Fnl.data$deadline_year))))
rw.names.state.yr<- row.names(state_by_year)
state_by_year<- as.data.frame((state_by_year))
state_by_year <-cbind(rw.names.state.yr,state_by_year)
row.names(state_by_year) <- NULL
names(state_by_year)[1]<- "Year"
head(state_by_year)
##   Year failed successful
## 1 2009    410        384
## 2 2010   4297       4008
## 3 2011  11201      11768
## 4 2012  20760      18063
## 5 2013  21346      19198
## 6 2014  37038      21202
state_by_year <- state_by_year %>% mutate(
        successrate= successful/(successful+failed),
        failurerate= failed/(successful+failed)
        )


p1 <-ggplot(state_by_year, aes(Year,successrate))
p1<-p1 +geom_bar(stat = "identity", aes(fill = successrate))

p2 <-ggplot(state_by_year, aes(Year,failurerate))
p2<- p2 +geom_bar(stat = "identity", aes(fill = failurerate))


grid.arrange(p1, p2, nrow=2)

# Identify the success project,failed projects,successrate,failedrate at yearmonth level
head(Fnl.data)
##           ID                                                         name
## 1 1000002330                              The Songs of Adelaide & Abullah
## 2 1000003930                Greeting From Earth: ZGAC Arts Capsule For ET
## 3 1000004038                                               Where is Hank?
## 4 1000007540            ToshiCapital Rekordz Needs Help to Complete Album
## 5 1000014025                                         Monarch Espresso Bar
## 6 1000023410 Support Solar Roasted Coffee & Green Energy!  SolarCoffee.co
##         category main_category currency   deadline  goal   launched
## 1         Poetry    Publishing      GBP 2015-10-09  1000 2015-08-11
## 2 Narrative Film  Film & Video      USD 2017-11-01 30000 2017-09-02
## 3 Narrative Film  Film & Video      USD 2013-02-26 45000 2013-01-12
## 4          Music         Music      USD 2012-04-16  5000 2012-03-17
## 5    Restaurants          Food      USD 2016-04-01 50000 2016-02-26
## 6           Food          Food      USD 2014-12-21  1000 2014-12-01
##   pledged      state backers country usd.pledged usd_pledged_real
## 1       0     failed       0      GB           0                0
## 2    2421     failed      15      US         100             2421
## 3     220     failed       3      US         220              220
## 4       1     failed       1      US           1                1
## 5   52375 successful     224      US       52375            52375
## 6    1205 successful      16      US        1205             1205
##   usd_goal_real deadline_year deadline_month deadline_day deadline_YrMth
## 1       1533.95          2015             10            9         201510
## 2      30000.00          2017             11            1         201711
## 3      45000.00          2013              2           26         201302
## 4       5000.00          2012              4           16         201204
## 5      50000.00          2016              4            1         201604
## 6       1000.00          2014             12           21         201412
##   launched_year launched_month launched_day launched_YrMth
## 1          2015              8           11         201508
## 2          2017              9            2         201709
## 3          2013              1           12         201301
## 4          2012              3           17         201203
## 5          2016              2           26         201602
## 6          2014             12            1         201412
state_by_mth <- as.data.frame.matrix( t(as.matrix(table(Fnl.data$state,Fnl.data$deadline_YrMth))))
rw.names.state.mth<- row.names(state_by_mth)
state_by_mth<- as.data.frame((state_by_mth))
state_by_mth <-cbind(rw.names.state.mth,state_by_mth)
row.names(state_by_mth) <- NULL
names(state_by_mth)[1]<- "YrMonth"
head(state_by_mth)
##   YrMonth failed successful
## 1  200905      8          3
## 2  200906     12         16
## 3  200907     21         38
## 4  200908     29         33
## 5  200909     38         41
## 6  200910     60         65
state_by_mth <- state_by_mth %>% mutate(
        successrate= successful/(successful+failed),
        failurerate= failed/(successful+failed)
        )


p1 <-ggplot(state_by_mth, aes(YrMonth,successrate))
p1<-p1 +geom_bar(stat = "identity", aes(fill = successrate))

p2 <-ggplot(state_by_mth, aes(YrMonth,failurerate))
p2<- p2 +geom_bar(stat = "identity", aes(fill = failurerate))


grid.arrange(p1, p2, nrow=2)

#Failed and Successful projects in each Category  

names(Fnl.data)
##  [1] "ID"               "name"             "category"        
##  [4] "main_category"    "currency"         "deadline"        
##  [7] "goal"             "launched"         "pledged"         
## [10] "state"            "backers"          "country"         
## [13] "usd.pledged"      "usd_pledged_real" "usd_goal_real"   
## [16] "deadline_year"    "deadline_month"   "deadline_day"    
## [19] "deadline_YrMth"   "launched_year"    "launched_month"  
## [22] "launched_day"     "launched_YrMth"
State.by.category.smry <- as.data.frame(table(Fnl.data$main_category, Fnl.data$state))
names(State.by.category.smry) <- c('main_category','State', 'Freq')
State.by.category.smry <- State.by.category.smry %>% group_by(main_category) %>% arrange(desc(Freq))

Failed.by.category<- State.by.category.smry %>% filter(State=="failed")
ggplot(Failed.by.category, aes(x = reorder(main_category,-Freq), y = `Freq`, fill = State)) +
  geom_bar(stat = "identity", position = "dodge") +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5)) +
  ggtitle("Total Projects in Different Categories by Failed State") +
  scale_y_continuous("# of Projects", labels = scales::comma) +
  scale_x_discrete("Main Categories")  

Successful.by.category<- State.by.category.smry %>% filter(State=="successful")
ggplot(Successful.by.category, aes(x = reorder(main_category,-Freq), y = `Freq`, fill = State)) +
  geom_bar(stat = "identity", position = "dodge") +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5)) +
  ggtitle("Total Projects in Different Categories by Successful State") +
  scale_y_continuous("# of Projects", labels = scales::comma) +
  scale_x_discrete("Main Categories")  

#Failed and Successful projects in each Category  

names(Fnl.data)
##  [1] "ID"               "name"             "category"        
##  [4] "main_category"    "currency"         "deadline"        
##  [7] "goal"             "launched"         "pledged"         
## [10] "state"            "backers"          "country"         
## [13] "usd.pledged"      "usd_pledged_real" "usd_goal_real"   
## [16] "deadline_year"    "deadline_month"   "deadline_day"    
## [19] "deadline_YrMth"   "launched_year"    "launched_month"  
## [22] "launched_day"     "launched_YrMth"
State.by.category.year<- as.data.frame(table(Fnl.data$main_category,Fnl.data$deadline_year, Fnl.data$state))
names(State.by.category.year) <- c('main_category','Yr','State', 'Freq')
State.by.category.year <- State.by.category.year %>% group_by(main_category) %>% arrange(desc(Freq))

Failed.by.category.year<- State.by.category.year %>% filter(State=="failed") %>% 
  arrange(desc(Freq)) %>%
  group_by(Yr) %>%
  filter(row_number() <= 5L)

p1<- ggplot(Failed.by.category.year, aes(x = Yr, y = `Freq`, fill = main_category)) +
  geom_bar(stat = "identity", position = "dodge") +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5)) +
  ggtitle("Total Projects in Different Categories by Failed State by year") +
  scale_y_continuous("# of Projects", labels = scales::comma) +
  scale_x_discrete("Year")  

Successful.by.category.year<- State.by.category.year %>% filter(State=="successful") %>% 
  arrange(desc(Freq)) %>%
  group_by(Yr) %>%
  filter(row_number() <= 5L)

p2<- ggplot(Successful.by.category.year, aes(x = Yr, y = `Freq`, fill = main_category)) +
  geom_bar(stat = "identity", position = "dodge") +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5)) +
  ggtitle("Total Projects in Different Categories by Successful State by year") +
  scale_y_continuous("# of Projects", labels = scales::comma) +
  scale_x_discrete("Year")  


grid.arrange(p1,p2,nrow=2)

ggplot(Fnl.data, aes(deadline_year, fill = Fnl.data$state)) +
  geom_bar() +
  theme(legend.position = "bottom") +
  facet_wrap( ~ main_category) +
  ylab("Number of Projects") + xlab("Launched Year") +
  ggtitle("KS projects launched over time by Category")

#Create time series
names(state_by_mth)
## [1] "YrMonth"     "failed"      "successful"  "successrate" "failurerate"
head(state_by_mth)
##   YrMonth failed successful successrate failurerate
## 1  200905      8          3   0.2727273   0.7272727
## 2  200906     12         16   0.5714286   0.4285714
## 3  200907     21         38   0.6440678   0.3559322
## 4  200908     29         33   0.5322581   0.4677419
## 5  200909     38         41   0.5189873   0.4810127
## 6  200910     60         65   0.5200000   0.4800000
state_by_mth %>% arrange(desc(YrMonth))
##     YrMonth failed successful successrate failurerate
## 1    201801    134         86   0.3909091   0.6090909
## 2    201712   2277       1714   0.4294663   0.5705337
## 3    201711   2203       1789   0.4481463   0.5518537
## 4    201710   2139       1652   0.4357689   0.5642311
## 5    201709   2104       1483   0.4134374   0.5865626
## 6    201708   2237       1592   0.4157744   0.5842256
## 7    201707   2442       1630   0.4002947   0.5997053
## 8    201706   2432       1652   0.4045054   0.5954946
## 9    201705   2362       1754   0.4261419   0.5738581
## 10   201704   2337       1809   0.4363242   0.5636758
## 11   201703   2489       1905   0.4335457   0.5664543
## 12   201702   2026       1370   0.4034158   0.5965842
## 13   201701   2006        998   0.3322237   0.6677763
## 14   201612   2426       1706   0.4128751   0.5871249
## 15   201611   2284       1738   0.4321233   0.5678767
## 16   201610   2295       1585   0.4085052   0.5914948
## 17   201609   2195       1392   0.3880680   0.6119320
## 18   201608   2331       1443   0.3823529   0.6176471
## 19   201607   2766       1652   0.3739249   0.6260751
## 20   201606   2788       1627   0.3685164   0.6314836
## 21   201605   2952       1871   0.3879328   0.6120672
## 22   201604   2837       1734   0.3793481   0.6206519
## 23   201603   3126       1800   0.3654080   0.6345920
## 24   201602   2580       1276   0.3309129   0.6690871
## 25   201601   2521       1000   0.2840102   0.7159898
## 26   201512   3311       1771   0.3484848   0.6515152
## 27   201511   3139       1777   0.3614727   0.6385273
## 28   201510   3297       1716   0.3423100   0.6576900
## 29   201509   3339       1477   0.3066860   0.6933140
## 30   201508   3528       1644   0.3178654   0.6821346
## 31   201507   4038       1938   0.3242972   0.6757028
## 32   201506   3883       1849   0.3225750   0.6774250
## 33   201505   4435       2087   0.3199939   0.6800061
## 34   201504   4630       2160   0.3181149   0.6818851
## 35   201503   4763       2112   0.3072000   0.6928000
## 36   201502   3379       1416   0.2953076   0.7046924
## 37   201501   2636       1128   0.2996812   0.7003188
## 38   201412   3629       1823   0.3343727   0.6656273
## 39   201411   3720       1906   0.3387842   0.6612158
## 40   201410   3840       1936   0.3351801   0.6648199
## 41   201409   4687       1797   0.2771437   0.7228563
## 42   201408   6450       2207   0.2549382   0.7450618
## 43   201407   2960       1823   0.3811415   0.6188585
## 44   201406   2305       1722   0.4276136   0.5723864
## 45   201405   2288       1865   0.4490730   0.5509270
## 46   201404   2074       1845   0.4707834   0.5292166
## 47   201403   1993       1830   0.4786817   0.5213183
## 48   201402   1519       1260   0.4534005   0.5465995
## 49   201401   1573       1188   0.4302789   0.5697211
## 50   201312   2101       1675   0.4435911   0.5564089
## 51   201311   1900       1639   0.4631252   0.5368748
## 52   201310   1904       1549   0.4485954   0.5514046
## 53   201309   1673       1421   0.4592760   0.5407240
## 54   201308   1847       1693   0.4782486   0.5217514
## 55   201307   2022       1738   0.4622340   0.5377660
## 56   201306   1883       1779   0.4858001   0.5141999
## 57   201305   1918       1874   0.4941983   0.5058017
## 58   201304   1766       1768   0.5002830   0.4997170
## 59   201303   1636       1823   0.5270309   0.4729691
## 60   201302   1425       1221   0.4614512   0.5385488
## 61   201301   1271       1018   0.4447357   0.5552643
## 62   201212   1635       1332   0.4489383   0.5510617
## 63   201211   1605       1385   0.4632107   0.5367893
## 64   201210   1633       1442   0.4689431   0.5310569
## 65   201209   1766       1437   0.4486419   0.5513581
## 66   201208   1872       1651   0.4686347   0.5313653
## 67   201207   1948       1754   0.4737979   0.5262021
## 68   201206   2056       1697   0.4521716   0.5478284
## 69   201205   2064       1769   0.4615184   0.5384816
## 70   201204   1920       1781   0.4812213   0.5187787
## 71   201203   1759       1662   0.4858229   0.5141771
## 72   201202   1259       1110   0.4685521   0.5314479
## 73   201201   1243       1043   0.4562555   0.5437445
## 74   201112   1268       1125   0.4701212   0.5298788
## 75   201111    890        988   0.5260916   0.4739084
## 76   201110   1013       1060   0.5113362   0.4886638
## 77   201109   1086       1052   0.4920486   0.5079514
## 78   201108   1165       1231   0.5137730   0.4862270
## 79   201107   1075       1095   0.5046083   0.4953917
## 80   201106    999       1038   0.5095729   0.4904271
## 81   201105   1002       1119   0.5275813   0.4724187
## 82   201104    815       1044   0.5615923   0.4384077
## 83   201103    748        858   0.5342466   0.4657534
## 84   201102    582        601   0.5080304   0.4919696
## 85   201101    558        557   0.4995516   0.5004484
## 86   201012    557        555   0.4991007   0.5008993
## 87   201011    500        489   0.4944388   0.5055612
## 88   201010    533        462   0.4643216   0.5356784
## 89   201009    539        451   0.4555556   0.5444444
## 90   201008    563        465   0.4523346   0.5476654
## 91   201007    566        435   0.4345654   0.5654346
## 92   201006    353        331   0.4839181   0.5160819
## 93   201005    243        262   0.5188119   0.4811881
## 94   201004    140        203   0.5918367   0.4081633
## 95   201003    116        147   0.5589354   0.4410646
## 96   201002     94        104   0.5252525   0.4747475
## 97   201001     93        104   0.5279188   0.4720812
## 98   200912    147         98   0.4000000   0.6000000
## 99   200911     95         90   0.4864865   0.5135135
## 100  200910     60         65   0.5200000   0.4800000
## 101  200909     38         41   0.5189873   0.4810127
## 102  200908     29         33   0.5322581   0.4677419
## 103  200907     21         38   0.6440678   0.3559322
## 104  200906     12         16   0.5714286   0.4285714
## 105  200905      8          3   0.2727273   0.7272727
success.ts <- ts(state_by_mth[,3], frequency=12,start=c(2009,5))
autoplot(success.ts)

failed.ts <- ts(state_by_mth[,2], frequency=12,start=c(2009,5))
autoplot(failed.ts)

ggsubseriesplot(success.ts)

ggsubseriesplot(failed.ts)

#Analysis of projects based on backers count
prj.Bkt<- Fnl.data %>%
  select(state,backers) %>% filter(state=="failed" | state== "successful") %>%
  mutate(greater_1000_backers =ifelse(backers >=1000,1,0 ),less_1000_backers=ifelse(backers< 1000,1,0) ) %>%
  arrange(desc(state)) %>%  group_by(state) %>% summarise_all(funs(sum))

prj.bkrs.Plot<- as.data.frame(((apply(prj.Bkt[,3:4],2,sum))))
names(prj.bkrs.Plot)[1] <- "count"
backers_category<- rownames(prj.bkrs.Plot) 

prj.bkrs.Plot <- cbind(backers_category,prj.bkrs.Plot)
rownames(prj.bkrs.Plot) <- NULL


ggplot(prj.bkrs.Plot, aes('', count, fill = backers_category)) + 
  geom_col(position = 'fill') +
  geom_label(aes(label = paste(format(round( (count/sum(count))*100, 2), nsmall = 2),"%")), position = position_fill(vjust = 0.6)) +
  coord_polar(theta = 'y')+
  labs(x = NULL, y = NULL, fill = "Backers Category", title = "Projects based on backers counts")

#Analysis of backers influence on projects if count is greater than 1000
grt.bkrs.Plot <- prj.Bkt[, c("state","greater_1000_backers")]


ggplot(grt.bkrs.Plot, aes('', greater_1000_backers, fill = state)) + 
  geom_col(position = 'fill') +
  geom_label(aes(label = paste(format(round( (greater_1000_backers/sum(greater_1000_backers))*100, 2), nsmall = 2),"%")), position = position_fill(vjust = 0.6)) +
  coord_polar(theta = 'y')+
  labs(x = NULL, y = NULL, fill = "Project State Category", title = "Project Success rate| if backers > 1000")

#Analysis of backers influence on projects if count is less than 1000
less.bkrs.Plot <- prj.Bkt[, c("state","less_1000_backers")]


ggplot(less.bkrs.Plot, aes('', less_1000_backers, fill = state)) + 
  geom_col(position = 'fill') +
  geom_label(aes(label = paste(format(round( (less_1000_backers/sum(less_1000_backers))*100, 2), nsmall = 2),"%")), position = position_fill(vjust = 0.6)) +
  coord_polar(theta = 'y')+
  labs(x = NULL, y = NULL, fill = "Project State Category", title = "Project Success rate| if backers < 1000")

#Analysis of projects based on goal count
goal.prj.Bkt<- Fnl.data %>%
  select(state,goal) %>% filter(state=="failed" | state== "successful") %>%
  mutate(greater_1000_goal =ifelse(goal >=100,1,0 ) ,less_1000_goal=ifelse(goal< 1000,1,0)) %>%
  arrange(desc(state)) %>%  group_by(state) %>% summarise_all(funs(sum))

prj.goal.Plot<- as.data.frame(((apply(goal.prj.Bkt[,3:4],2,sum))))
names(prj.goal.Plot)[1] <- "count"
goal_category<- rownames(prj.goal.Plot) 

prj.goal.Plot <- cbind(goal_category,prj.goal.Plot)
rownames(prj.goal.Plot) <- NULL


ggplot(prj.goal.Plot, aes('', count, fill = goal_category)) + 
  geom_col(position = 'fill') +
  geom_label(aes(label = paste(format(round( (count/sum(count))*100, 2), nsmall = 2),"%")), position = position_fill(vjust = 0.6)) +
  coord_polar(theta = 'y')+
  labs(x = NULL, y = NULL, fill = "Goal Category", title = "Projects based on goal counts")

#Analysis of goal influence on projects if count is greater than 1000
grt.goal.Plot <- goal.prj.Bkt[, c("state","greater_1000_goal")]


ggplot(grt.goal.Plot, aes('', greater_1000_goal, fill = state)) + 
  geom_col(position = 'fill') +
  geom_label(aes(label = paste(format(round( (greater_1000_goal/sum(greater_1000_goal))*100, 2), nsmall = 2),"%")), position = position_fill(vjust = 0.6)) +
  coord_polar(theta = 'y')+
  labs(x = NULL, y = NULL, fill = "Project State Category", title = "Project Success rate| if goal > 1000")

#Analysis of goal influence on projects if count is less than 1000
less.goal.Plot <- goal.prj.Bkt[, c("state","less_1000_goal")]


ggplot(less.goal.Plot, aes('', less_1000_goal, fill = state)) + 
  geom_col(position = 'fill') +
  geom_label(aes(label = paste(format(round( (less_1000_goal/sum(less_1000_goal))*100, 2), nsmall = 2),"%")), position = position_fill(vjust = 0.6)) +
  coord_polar(theta = 'y')+
  labs(x = NULL, y = NULL, fill = "Project State Category", title = "Project Success rate| if goal < 1000")

#Required data for time series creation
failed_req_var <- Fnl.data  %>% select(state,deadline_YrMth,goal,usd_pledged_real,backers) %>%  filter(state=="failed") %>% group_by(state,deadline_YrMth) %>% summarise_all(funs(sum)  )
failed_req_var <- failed_req_var[,2:5]
names(failed_req_var)[2] <- "failed_goal"
names(failed_req_var)[3] <- "failed_Usd_pledged_real"
names(failed_req_var)[4] <- "failed_backer"
failed_req_var$deadline_YrMth <- as.factor(failed_req_var$deadline_YrMth)
Data.for.ts<- state_by_mth %>% inner_join(failed_req_var, by = c("YrMonth" = "deadline_YrMth"))


successful_req_var <- Fnl.data  %>% select(state,deadline_YrMth,goal,usd_pledged_real,backers) %>%  filter(state=="successful") %>% group_by(state,deadline_YrMth) %>% summarise_all(funs(sum)  )
successful_req_var <- successful_req_var[,2:5]
names(successful_req_var)[2] <- "successful_goal"
names(successful_req_var)[3] <- "successful_Usd_pledged_real"
names(successful_req_var)[4] <- "successful_backer"
successful_req_var$deadline_YrMth <- as.factor(successful_req_var$deadline_YrMth)
Data.for.ts<- Data.for.ts %>% inner_join(successful_req_var, by = c("YrMonth" = "deadline_YrMth"))
#Splitting the data into training and test data
kck.str.ts <- ts(Data.for.ts[,2:11], frequency=12,start=c(2009,5),end=c(2017,12))
kck.str.train.ts<- window(kck.str.ts,end=c(2016,12))
kck.str.test.ts<- window(kck.str.ts,start=c(2017,01),end=c(2017,12))
#Decompose Success State Time Series 

Success.decomp <- stl(kck.str.ts[,2],s.window = "periodic")
autoplot(Success.decomp) + ggtitle("Decomposition of Successful Projects over time")

#Number of Successful Project forecast using ETS Method

success.ets.fst <- ets(kck.str.train.ts[,2])
success.ets.Pred <- forecast( success.ets.fst, h=length(kck.str.test.ts[,2]))
round(accuracy(success.ets.Pred,kck.str.test.ts[,2]),3)
##                    ME    RMSE     MAE     MPE   MAPE  MASE   ACF1
## Training set    3.644 224.055 142.464  -0.016 12.502 0.450 -0.020
## Test set     -185.455 284.993 219.313 -14.309 16.108 0.692  0.302
##              Theil's U
## Training set        NA
## Test set         0.893
#Check the residuals of the ets method 
checkresiduals(success.ets.fst)

## 
##  Ljung-Box test
## 
## data:  Residuals from ETS(M,A,N)
## Q* = 88.09, df = 14.4, p-value = 1.278e-12
## 
## Model df: 4.   Total lags used: 18.4
autoplot(kck.str.train.ts[,2],series = "Data" ) +  
  autolayer(fitted(success.ets.fst), series = "Fitted") +
  autolayer(success.ets.Pred, PI = TRUE, series = "ETS") +
   xlab("Year") + ylab("No of Projects") +
  ggtitle("No of Successful Projects Forecast using ETS Method") + 
  guides(colour = guide_legend(title = "Model"))

#Number of Successful Project forecast using ARIMA Method


success.arima.fst <- auto.arima(kck.str.train.ts[,2] )
success.arima.Pred <- forecast( success.arima.fst, h=length(kck.str.test.ts[,2]))
#Accuracy 
round(accuracy(success.arima.Pred,kck.str.test.ts[,2]),3)
##                  ME    RMSE    MAE    MPE  MAPE  MASE   ACF1 Theil's U
## Training set -4.538 130.130 87.362 -0.156 6.520 0.276 -0.066        NA
## Test set     68.891  95.157 85.882  4.393 5.371 0.271  0.134     0.407
#Plot the model using Arima
autoplot(success.arima.Pred)+
     xlab("Year") + ylab("No of Projects") +
  ggtitle("No of Successful Projects Forecast using ARIMA") + 
  guides(colour = guide_legend(title = "Model"))

#Check the residuals of the ARIMA method 
checkresiduals(success.arima.Pred)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(0,1,0)(0,1,1)[12]
## Q* = 25.499, df = 17.4, p-value = 0.09453
## 
## Model df: 1.   Total lags used: 18.4
prediction=predict(success.arima.fst,n.ahead=12)


#Plot the data vs fitted vs validation set using ets method
autoplot(kck.str.train.ts[,2],series = "Data" ) +  
  autolayer(fitted(success.arima.Pred), series = "Fitted") +
  autolayer(success.arima.Pred, PI = TRUE, series = "ARIMA") +
   xlab("Year") + ylab("No of Projects") +
  ggtitle("No of Successful Projects Forecast using ARIMA Method") + 
  guides(colour = guide_legend(title = "Model"))

success.arima.fst %>% forecast(h=12) %>% autoplot()

#Check the residuals of the ets method 
checkresiduals(success.arima.fst)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(0,1,0)(0,1,1)[12]
## Q* = 25.499, df = 17.4, p-value = 0.09453
## 
## Model df: 1.   Total lags used: 18.4
summary(success.arima.fst)
## Series: kck.str.train.ts[, 2] 
## ARIMA(0,1,0)(0,1,1)[12] 
## 
## Coefficients:
##          sma1
##       -0.2206
## s.e.   0.1066
## 
## sigma^2 estimated as 19973:  log likelihood=-503.03
## AIC=1010.05   AICc=1010.21   BIC=1014.79
## 
## Training set error measures:
##                     ME     RMSE      MAE        MPE     MAPE     MASE
## Training set -4.537775 130.1305 87.36204 -0.1557508 6.520436 0.275764
##                     ACF1
## Training set -0.06605202
#Correlation of required variables 

kck.str.ts[,c("successful","successful_backer","successful_goal","successful_Usd_pledged_real")] %>%
  as.data.frame() %>%
  GGally::ggpairs()

#Number of successful projects using dynamic regression 

xreg <- cbind(successful_goal = kck.str.train.ts[, "successful_goal"],
              successful_backer = kck.str.train.ts[, "successful_backer"],
              successful_Usd_pledged_real=  kck.str.train.ts[,"successful_Usd_pledged_real"])

success.dym.arima.fst <- auto.arima(kck.str.train.ts[,"successful"],
  xreg=xreg,seasonal = TRUE,approximation = FALSE, stepwise = FALSE)


checkresiduals(success.dym.arima.fst)

## 
##  Ljung-Box test
## 
## data:  Residuals from Regression with ARIMA(1,0,0)(1,0,0)[12] errors
## Q* = 13.649, df = 12.4, p-value = 0.3534
## 
## Model df: 6.   Total lags used: 18.4
success.dym.arima.pred <- forecast(success.dym.arima.fst, xreg=
                                     cbind( rep(mean(kck.str.ts[,"successful_goal"]),12), 
                                            rep(mean(kck.str.ts[,"successful_backer"]),12),
                                                rep(mean(kck.str.ts[,"successful_Usd_pledged_real"]),12)
                                            ) )
## Warning in forecast.Arima(success.dym.arima.fst, xreg =
## cbind(rep(mean(kck.str.ts[, : xreg contains different column names from
## the xreg used in training. Please check that the regressors are in the same
## order.
round(accuracy(success.dym.arima.pred,kck.str.test.ts[,2]),3)
##                   ME    RMSE     MAE     MPE   MAPE  MASE   ACF1 Theil's U
## Training set  12.385 118.864  84.452 -64.497 72.999 0.267 -0.147        NA
## Test set     392.816 428.058 395.601  23.105 23.384 1.249  0.293     1.604
summary(success.dym.arima.pred)
## 
## Forecast method: Regression with ARIMA(1,0,0)(1,0,0)[12] errors
## 
## Model Information:
## Series: kck.str.train.ts[, "successful"] 
## Regression with ARIMA(1,0,0)(1,0,0)[12] errors 
## 
## Coefficients:
## Warning in sqrt(diag(x$var.coef)): NaNs produced
##          ar1    sar1  intercept  successful_goal  successful_backer
##       0.9061  0.5852    607.362                0             -2e-04
## s.e.  0.0383  0.0373        NaN              NaN              2e-04
##       successful_Usd_pledged_real
##                                 0
## s.e.                          NaN
## 
## sigma^2 estimated as 15114:  log likelihood=-573.69
## AIC=1161.38   AICc=1162.71   BIC=1179.03
## 
## Error measures:
##                    ME     RMSE      MAE       MPE    MAPE      MASE
## Training set 12.38458 118.8638 84.45157 -64.49652 72.9986 0.2665769
##                    ACF1
## Training set -0.1468526
## 
## Forecasts:
##          Point Forecast     Lo 80    Hi 80    Lo 95    Hi 95
## Jan 2017       1014.709  857.1546 1172.263 773.7503 1255.668
## Feb 2017       1203.315  990.7029 1415.928 878.1527 1528.478
## Mar 2017       1330.834 1081.9629 1579.706 950.2183 1711.451
## Apr 2017       1299.894 1024.8027 1574.985 879.1781 1720.610
## May 2017       1327.804 1032.9233 1622.685 876.8228 1778.785
## Jun 2017       1179.667  869.4814 1489.854 705.2788 1654.056
## Jul 2017       1228.051  905.8417 1550.260 735.2746 1720.827
## Aug 2017       1204.053  872.2980 1535.807 696.6777 1711.428
## Sep 2017       1232.897  893.5055 1572.288 713.8427 1751.951
## Oct 2017       1205.324  859.7888 1550.859 676.8737 1733.774
## Nov 2017       1280.788  930.2891 1631.286 744.7464 1816.829
## Dec 2017       1126.866  772.3446 1481.388 584.6721 1669.061
autoplot(success.dym.arima.pred) + xlab("Year") +
  ylab("No of successful projects")

#Plot the data vs fitted vs validation set using dynamic regression method
autoplot(kck.str.train.ts[,2],series = "Data" ) +  
  autolayer(fitted(success.dym.arima.pred), series = "Fitted") +
  autolayer(success.dym.arima.pred, PI = TRUE, series = "Dynamic Regression with ARIMA") +
   xlab("Year") + ylab("No of Projects") +
  ggtitle("No of Successful Projects Forecast using Dynamic Regresssion Method") + 
  guides(colour = guide_legend(title = "Model"))

#Forecasting of all the models
autoplot(kck.str.train.ts[,2],series = "Data" ) +
  autolayer(success.dym.arima.pred, PI = FALSE, series = "DYNAMIC REGRESSION ARIMA") +
  autolayer(success.arima.Pred, PI = FALSE, series = " ARIMA") +
  autolayer(success.ets.Pred, PI = FALSE, series = " ETS") +
  xlab("Year") + ylab("No of Projects") +
  ggtitle("No of Successful Projects Forecast") + 
  guides(colour = guide_legend(title = "Model"))

#Get the metrics required to select the model
round(accuracy(success.ets.Pred,kck.str.test.ts[,2]),3)[,2]
## Training set     Test set 
##      224.055      284.993
round(accuracy(success.arima.Pred,kck.str.test.ts[,2]),3)[,2]
## Training set     Test set 
##      130.130       95.157
round(accuracy(success.dym.arima.pred,kck.str.test.ts[,2]),3)[,2]
## Training set     Test set 
##      118.864      428.058
success.ets.fst$aic
## [1] 1348.611
success.ets.fst$aicc
## [1] 1349.309
success.arima.fst$aic
## [1] 1010.055
success.arima.fst$aicc
## [1] 1010.212
success.dym.arima.fst$aic
## [1] 1161.379
success.dym.arima.fst$aicc
## [1] 1162.712
summary(success.arima.Pred)[,1]
## 
## Forecast method: ARIMA(0,1,0)(0,1,1)[12]
## 
## Model Information:
## Series: kck.str.train.ts[, 2] 
## ARIMA(0,1,0)(0,1,1)[12] 
## 
## Coefficients:
##          sma1
##       -0.2206
## s.e.   0.1066
## 
## sigma^2 estimated as 19973:  log likelihood=-503.03
## AIC=1010.05   AICc=1010.21   BIC=1014.79
## 
## Error measures:
##                     ME     RMSE      MAE        MPE     MAPE     MASE
## Training set -4.537775 130.1305 87.36204 -0.1557508 6.520436 0.275764
##                     ACF1
## Training set -0.06605202
## 
## Forecasts:
##          Point Forecast     Lo 80    Hi 80     Lo 95    Hi 95
## Jan 2017       964.3314  783.2134 1145.449  687.3353 1241.327
## Feb 2017      1233.5371  977.3975 1489.677  841.8054 1625.269
## Mar 2017      1789.3924 1475.6868 2103.098 1309.6211 2269.164
## Apr 2017      1746.6127 1384.3767 2108.849 1192.6205 2300.605
## May 2017      1842.5055 1437.5134 2247.498 1223.1235 2461.888
## Jun 2017      1605.0312 1161.3845 2048.678  926.5322 2283.530
## Jul 2017      1643.4428 1164.2496 2122.636  910.5801 2376.306
## Aug 2017      1444.0499  931.7709 1956.329  660.5867 2227.513
## Sep 2017      1357.2985  813.9445 1900.653  526.3103 2188.287
## Oct 2017      1555.1727  982.4273 2127.918  679.2342 2431.111
## Nov 2017      1684.3969 1083.6965 2285.097  765.7048 2603.089
## Dec 2017      1655.5415 1028.1303 2282.953  695.9989 2615.084
##  [1]  964.3314 1233.5371 1789.3924 1746.6127 1842.5055 1605.0312 1643.4428
##  [8] 1444.0499 1357.2985 1555.1727 1684.3969 1655.5415
autoplot(success.arima.Pred$mean)

plot.fst.final<- as.data.frame(as.matrix(success.arima.Pred$mean) , stringsAsFactors = FALSE)
names(plot.fst.final)[1] <- "Forecast"
row.names(plot.fst.final)
##  [1] "1"  "2"  "3"  "4"  "5"  "6"  "7"  "8"  "9"  "10" "11" "12"
rownames<- c("Jan","Feb","Mar","Apr","May","Jun","Jul","Aug","Sep","Oct","Nov","Dec")
plot.fst <- cbind(rownames,plot.fst.final)
plot.fst$rownames <- factor(plot.fst$rownames,levels=c("Jan","Feb","Mar","Apr","May","Jun","Jul","Aug","Sep","Oct","Nov","Dec"))
str(plot.fst)
## 'data.frame':    12 obs. of  2 variables:
##  $ rownames: Factor w/ 12 levels "Jan","Feb","Mar",..: 1 2 3 4 5 6 7 8 9 10 ...
##  $ Forecast: num  964 1234 1789 1747 1843 ...
ggplot(data=plot.fst, aes(x=rownames,y=Forecast))+
  geom_point()+
 xlab("Month") + ylab("No of Projects") +
  ggtitle("Estimation of Successful Projects for next year")