Loading the Data
raw.data<- read.csv("C:\\Users\\nidhi\\OneDrive\\Desktop\\Forecasting Project3\\ks-projects-201801.csv")
dim(raw.data)
## [1] 378661 15
names(raw.data)
## [1] "ID" "name" "category"
## [4] "main_category" "currency" "deadline"
## [7] "goal" "launched" "pledged"
## [10] "state" "backers" "country"
## [13] "usd.pledged" "usd_pledged_real" "usd_goal_real"
head(raw.data)
## ID name
## 1 1000002330 The Songs of Adelaide & Abullah
## 2 1000003930 Greeting From Earth: ZGAC Arts Capsule For ET
## 3 1000004038 Where is Hank?
## 4 1000007540 ToshiCapital Rekordz Needs Help to Complete Album
## 5 1000011046 Community Film Project: The Art of Neighborhood Filmmaking
## 6 1000014025 Monarch Espresso Bar
## category main_category currency deadline goal
## 1 Poetry Publishing GBP 2015-10-09 1000
## 2 Narrative Film Film & Video USD 2017-11-01 30000
## 3 Narrative Film Film & Video USD 2013-02-26 45000
## 4 Music Music USD 2012-04-16 5000
## 5 Film & Video Film & Video USD 2015-08-29 19500
## 6 Restaurants Food USD 2016-04-01 50000
## launched pledged state backers country usd.pledged
## 1 2015-08-11 12:12:28 0 failed 0 GB 0
## 2 2017-09-02 04:43:57 2421 failed 15 US 100
## 3 2013-01-12 00:20:50 220 failed 3 US 220
## 4 2012-03-17 03:24:11 1 failed 1 US 1
## 5 2015-07-04 08:35:03 1283 canceled 14 US 1283
## 6 2016-02-26 13:38:27 52375 successful 224 US 52375
## usd_pledged_real usd_goal_real
## 1 0 1533.95
## 2 2421 30000.00
## 3 220 45000.00
## 4 1 5000.00
## 5 1283 19500.00
## 6 52375 50000.00
str(raw.data)
## 'data.frame': 378661 obs. of 15 variables:
## $ ID : int 1000002330 1000003930 1000004038 1000007540 1000011046 1000014025 1000023410 1000030581 1000034518 100004195 ...
## $ name : Factor w/ 375765 levels "","\177Not Twins - New EP! \"The View from Down Here\"",..: 332541 135689 365010 344805 77349 206130 293462 69360 284139 290718 ...
## $ category : Factor w/ 159 levels "3D Printing",..: 109 94 94 91 56 124 59 42 114 40 ...
## $ main_category : Factor w/ 15 levels "Art","Comics",..: 13 7 7 11 7 8 8 8 5 7 ...
## $ currency : Factor w/ 14 levels "AUD","CAD","CHF",..: 6 14 14 14 14 14 14 14 14 14 ...
## $ deadline : Factor w/ 3164 levels "2009-05-03","2009-05-16",..: 2288 3042 1333 1017 2247 2463 1996 2448 1790 1863 ...
## $ goal : num 1000 30000 45000 5000 19500 50000 1000 25000 125000 65000 ...
## $ launched : Factor w/ 378089 levels "1970-01-01 01:00:00",..: 243292 361975 80409 46557 235943 278600 187500 274014 139367 153766 ...
## $ pledged : num 0 2421 220 1 1283 ...
## $ state : Factor w/ 6 levels "canceled","failed",..: 2 2 2 2 1 4 4 2 1 1 ...
## $ backers : int 0 15 3 1 14 224 16 40 58 43 ...
## $ country : Factor w/ 23 levels "AT","AU","BE",..: 10 23 23 23 23 23 23 23 23 23 ...
## $ usd.pledged : num 0 100 220 1 1283 ...
## $ usd_pledged_real: num 0 2421 220 1 1283 ...
## $ usd_goal_real : num 1534 30000 45000 5000 19500 ...
#Libaries Required
library(lubridate)
## Warning: package 'lubridate' was built under R version 3.5.1
##
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
##
## date
library(fpp2)
## Warning: package 'fpp2' was built under R version 3.5.3
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 3.5.3
## Loading required package: forecast
## Warning: package 'forecast' was built under R version 3.5.2
## Loading required package: fma
## Warning: package 'fma' was built under R version 3.5.2
## Loading required package: expsmooth
## Warning: package 'expsmooth' was built under R version 3.5.2
library(gridExtra)
## Warning: package 'gridExtra' was built under R version 3.5.3
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.5.1
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:gridExtra':
##
## combine
## The following objects are masked from 'package:lubridate':
##
## intersect, setdiff, union
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
#Data Cleaning
raw.data$ID <- as.character(raw.data$ID)
raw.data$name <- as.character(raw.data$name)
raw.data$deadline <- as.Date(raw.data$deadline)
raw.data$launched <- as.Date(raw.data$launched)
prcd.data<- raw.data %>%
mutate(deadline_year = year(deadline),
deadline_month = month(deadline),
deadline_day = day(deadline),
deadline_YrMth=format(as.Date(raw.data$deadline), "%Y%m") )
## Warning: package 'bindrcpp' was built under R version 3.5.1
prcd.data<- prcd.data %>%
mutate(launched_year = year(launched),
launched_month = month(launched),
launched_day = day(launched),
launched_YrMth=format(as.Date(raw.data$launched), "%Y%m"))
#Focus on Failed and Successful
Fnl.data <- prcd.data %>% filter(state == "failed" | state == "successful")
Fnl.data$state <- as.character(Fnl.data$state)
Fnl.data$state <- as.factor(Fnl.data$state)
summary(Fnl.data$state)
## failed successful
## 197719 133956
#Plot the number of projects in each states of the projects
ggplot(prcd.data, aes(state)) +
geom_bar(fill = "#FF6666") +
ylab("# of Projects") + xlab("Final State") +
ggtitle("Final State of the Kickstarter projects")

#State of projects in each year based on deadline
ggplot(prcd.data, aes(x=deadline_year,fill=factor(state))) +
geom_bar() +
#theme(axis.text.x = element_text(angle = 45, vjust = 0.5)) +
#scale_x_discrete(breaks=prcd.data$deadline_year)+
ylab("No of Projects") + xlab("Year") +
ggtitle("State of the Kickstarter projects")

#Failed and Successful projects in each year based on deadline
State.by.year.smry <- as.data.frame(table(Fnl.data$deadline_year, Fnl.data$state))
names(State.by.year.smry) <- c('deadline_Year','State', 'Freq')
State.by.year.smry <- State.by.year.smry %>% group_by(deadline_Year) %>% arrange(desc(Freq))
ggplot(State.by.year.smry, aes(x = deadline_Year, y = `Freq`, fill = State)) +
geom_bar(stat = "identity", position = "dodge") +
ggtitle("Failed & Successful Projects deadline over time") +
scale_y_continuous("# of Projects deadline", labels = scales::comma) +
scale_x_discrete("Year")

# Plot Total Projects by all the different State
state.by.tot <- as.data.frame.matrix((as.matrix(table(raw.data$state))))
rw.names.state.by.tot<- row.names(state.by.tot)
state.by.tot <-cbind(rw.names.state.by.tot,state.by.tot)
row.names(state.by.tot) <- NULL
names(state.by.tot)<- c("State","Value")
ggplot(state.by.tot, aes('', Value, fill = State)) +
geom_col(position = 'fill') +
geom_label(aes(label = paste(format(round( (Value/sum(Value))*100, 2), nsmall = 2),"%")), position = position_fill(vjust = 0.6)) +
coord_polar(theta = 'y')+
labs(x = NULL, y = NULL, fill = "State", title = "Total Projects by Failed/Successful State")

state.by.tot <- as.data.frame.matrix((as.matrix(table(Fnl.data$state))))
rw.names.state.by.tot<- row.names(state.by.tot)
state.by.tot <-cbind(rw.names.state.by.tot,state.by.tot)
row.names(state.by.tot) <- NULL
names(state.by.tot)<- c("State","Value")
# Plot Total Projects by Failed /Successful State
ggplot(state.by.tot, aes('', Value, fill = State)) +
geom_col(position = 'fill') +
geom_label(aes(label = paste(format(round( (Value/sum(Value))*100, 2), nsmall = 2),"%")), position = position_fill(vjust = 0.6)) +
coord_polar(theta = 'y')+
labs(x = NULL, y = NULL, fill = "State", title = "Total Projects by Failed/Successful State")

# Identify the success project,failed projects,successrate,failedrate at year level
state_by_year <- as.data.frame.matrix( t(as.matrix(table(Fnl.data$state,Fnl.data$deadline_year))))
rw.names.state.yr<- row.names(state_by_year)
state_by_year<- as.data.frame((state_by_year))
state_by_year <-cbind(rw.names.state.yr,state_by_year)
row.names(state_by_year) <- NULL
names(state_by_year)[1]<- "Year"
head(state_by_year)
## Year failed successful
## 1 2009 410 384
## 2 2010 4297 4008
## 3 2011 11201 11768
## 4 2012 20760 18063
## 5 2013 21346 19198
## 6 2014 37038 21202
state_by_year <- state_by_year %>% mutate(
successrate= successful/(successful+failed),
failurerate= failed/(successful+failed)
)
p1 <-ggplot(state_by_year, aes(Year,successrate))
p1<-p1 +geom_bar(stat = "identity", aes(fill = successrate))
p2 <-ggplot(state_by_year, aes(Year,failurerate))
p2<- p2 +geom_bar(stat = "identity", aes(fill = failurerate))
grid.arrange(p1, p2, nrow=2)

# Identify the success project,failed projects,successrate,failedrate at yearmonth level
head(Fnl.data)
## ID name
## 1 1000002330 The Songs of Adelaide & Abullah
## 2 1000003930 Greeting From Earth: ZGAC Arts Capsule For ET
## 3 1000004038 Where is Hank?
## 4 1000007540 ToshiCapital Rekordz Needs Help to Complete Album
## 5 1000014025 Monarch Espresso Bar
## 6 1000023410 Support Solar Roasted Coffee & Green Energy! SolarCoffee.co
## category main_category currency deadline goal launched
## 1 Poetry Publishing GBP 2015-10-09 1000 2015-08-11
## 2 Narrative Film Film & Video USD 2017-11-01 30000 2017-09-02
## 3 Narrative Film Film & Video USD 2013-02-26 45000 2013-01-12
## 4 Music Music USD 2012-04-16 5000 2012-03-17
## 5 Restaurants Food USD 2016-04-01 50000 2016-02-26
## 6 Food Food USD 2014-12-21 1000 2014-12-01
## pledged state backers country usd.pledged usd_pledged_real
## 1 0 failed 0 GB 0 0
## 2 2421 failed 15 US 100 2421
## 3 220 failed 3 US 220 220
## 4 1 failed 1 US 1 1
## 5 52375 successful 224 US 52375 52375
## 6 1205 successful 16 US 1205 1205
## usd_goal_real deadline_year deadline_month deadline_day deadline_YrMth
## 1 1533.95 2015 10 9 201510
## 2 30000.00 2017 11 1 201711
## 3 45000.00 2013 2 26 201302
## 4 5000.00 2012 4 16 201204
## 5 50000.00 2016 4 1 201604
## 6 1000.00 2014 12 21 201412
## launched_year launched_month launched_day launched_YrMth
## 1 2015 8 11 201508
## 2 2017 9 2 201709
## 3 2013 1 12 201301
## 4 2012 3 17 201203
## 5 2016 2 26 201602
## 6 2014 12 1 201412
state_by_mth <- as.data.frame.matrix( t(as.matrix(table(Fnl.data$state,Fnl.data$deadline_YrMth))))
rw.names.state.mth<- row.names(state_by_mth)
state_by_mth<- as.data.frame((state_by_mth))
state_by_mth <-cbind(rw.names.state.mth,state_by_mth)
row.names(state_by_mth) <- NULL
names(state_by_mth)[1]<- "YrMonth"
head(state_by_mth)
## YrMonth failed successful
## 1 200905 8 3
## 2 200906 12 16
## 3 200907 21 38
## 4 200908 29 33
## 5 200909 38 41
## 6 200910 60 65
state_by_mth <- state_by_mth %>% mutate(
successrate= successful/(successful+failed),
failurerate= failed/(successful+failed)
)
p1 <-ggplot(state_by_mth, aes(YrMonth,successrate))
p1<-p1 +geom_bar(stat = "identity", aes(fill = successrate))
p2 <-ggplot(state_by_mth, aes(YrMonth,failurerate))
p2<- p2 +geom_bar(stat = "identity", aes(fill = failurerate))
grid.arrange(p1, p2, nrow=2)

#Failed and Successful projects in each Category
names(Fnl.data)
## [1] "ID" "name" "category"
## [4] "main_category" "currency" "deadline"
## [7] "goal" "launched" "pledged"
## [10] "state" "backers" "country"
## [13] "usd.pledged" "usd_pledged_real" "usd_goal_real"
## [16] "deadline_year" "deadline_month" "deadline_day"
## [19] "deadline_YrMth" "launched_year" "launched_month"
## [22] "launched_day" "launched_YrMth"
State.by.category.smry <- as.data.frame(table(Fnl.data$main_category, Fnl.data$state))
names(State.by.category.smry) <- c('main_category','State', 'Freq')
State.by.category.smry <- State.by.category.smry %>% group_by(main_category) %>% arrange(desc(Freq))
Failed.by.category<- State.by.category.smry %>% filter(State=="failed")
ggplot(Failed.by.category, aes(x = reorder(main_category,-Freq), y = `Freq`, fill = State)) +
geom_bar(stat = "identity", position = "dodge") +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5)) +
ggtitle("Total Projects in Different Categories by Failed State") +
scale_y_continuous("# of Projects", labels = scales::comma) +
scale_x_discrete("Main Categories")

Successful.by.category<- State.by.category.smry %>% filter(State=="successful")
ggplot(Successful.by.category, aes(x = reorder(main_category,-Freq), y = `Freq`, fill = State)) +
geom_bar(stat = "identity", position = "dodge") +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5)) +
ggtitle("Total Projects in Different Categories by Successful State") +
scale_y_continuous("# of Projects", labels = scales::comma) +
scale_x_discrete("Main Categories")

#Failed and Successful projects in each Category
names(Fnl.data)
## [1] "ID" "name" "category"
## [4] "main_category" "currency" "deadline"
## [7] "goal" "launched" "pledged"
## [10] "state" "backers" "country"
## [13] "usd.pledged" "usd_pledged_real" "usd_goal_real"
## [16] "deadline_year" "deadline_month" "deadline_day"
## [19] "deadline_YrMth" "launched_year" "launched_month"
## [22] "launched_day" "launched_YrMth"
State.by.category.year<- as.data.frame(table(Fnl.data$main_category,Fnl.data$deadline_year, Fnl.data$state))
names(State.by.category.year) <- c('main_category','Yr','State', 'Freq')
State.by.category.year <- State.by.category.year %>% group_by(main_category) %>% arrange(desc(Freq))
Failed.by.category.year<- State.by.category.year %>% filter(State=="failed") %>%
arrange(desc(Freq)) %>%
group_by(Yr) %>%
filter(row_number() <= 5L)
p1<- ggplot(Failed.by.category.year, aes(x = Yr, y = `Freq`, fill = main_category)) +
geom_bar(stat = "identity", position = "dodge") +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5)) +
ggtitle("Total Projects in Different Categories by Failed State by year") +
scale_y_continuous("# of Projects", labels = scales::comma) +
scale_x_discrete("Year")
Successful.by.category.year<- State.by.category.year %>% filter(State=="successful") %>%
arrange(desc(Freq)) %>%
group_by(Yr) %>%
filter(row_number() <= 5L)
p2<- ggplot(Successful.by.category.year, aes(x = Yr, y = `Freq`, fill = main_category)) +
geom_bar(stat = "identity", position = "dodge") +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5)) +
ggtitle("Total Projects in Different Categories by Successful State by year") +
scale_y_continuous("# of Projects", labels = scales::comma) +
scale_x_discrete("Year")
grid.arrange(p1,p2,nrow=2)

ggplot(Fnl.data, aes(deadline_year, fill = Fnl.data$state)) +
geom_bar() +
theme(legend.position = "bottom") +
facet_wrap( ~ main_category) +
ylab("Number of Projects") + xlab("Launched Year") +
ggtitle("KS projects launched over time by Category")

#Create time series
names(state_by_mth)
## [1] "YrMonth" "failed" "successful" "successrate" "failurerate"
head(state_by_mth)
## YrMonth failed successful successrate failurerate
## 1 200905 8 3 0.2727273 0.7272727
## 2 200906 12 16 0.5714286 0.4285714
## 3 200907 21 38 0.6440678 0.3559322
## 4 200908 29 33 0.5322581 0.4677419
## 5 200909 38 41 0.5189873 0.4810127
## 6 200910 60 65 0.5200000 0.4800000
state_by_mth %>% arrange(desc(YrMonth))
## YrMonth failed successful successrate failurerate
## 1 201801 134 86 0.3909091 0.6090909
## 2 201712 2277 1714 0.4294663 0.5705337
## 3 201711 2203 1789 0.4481463 0.5518537
## 4 201710 2139 1652 0.4357689 0.5642311
## 5 201709 2104 1483 0.4134374 0.5865626
## 6 201708 2237 1592 0.4157744 0.5842256
## 7 201707 2442 1630 0.4002947 0.5997053
## 8 201706 2432 1652 0.4045054 0.5954946
## 9 201705 2362 1754 0.4261419 0.5738581
## 10 201704 2337 1809 0.4363242 0.5636758
## 11 201703 2489 1905 0.4335457 0.5664543
## 12 201702 2026 1370 0.4034158 0.5965842
## 13 201701 2006 998 0.3322237 0.6677763
## 14 201612 2426 1706 0.4128751 0.5871249
## 15 201611 2284 1738 0.4321233 0.5678767
## 16 201610 2295 1585 0.4085052 0.5914948
## 17 201609 2195 1392 0.3880680 0.6119320
## 18 201608 2331 1443 0.3823529 0.6176471
## 19 201607 2766 1652 0.3739249 0.6260751
## 20 201606 2788 1627 0.3685164 0.6314836
## 21 201605 2952 1871 0.3879328 0.6120672
## 22 201604 2837 1734 0.3793481 0.6206519
## 23 201603 3126 1800 0.3654080 0.6345920
## 24 201602 2580 1276 0.3309129 0.6690871
## 25 201601 2521 1000 0.2840102 0.7159898
## 26 201512 3311 1771 0.3484848 0.6515152
## 27 201511 3139 1777 0.3614727 0.6385273
## 28 201510 3297 1716 0.3423100 0.6576900
## 29 201509 3339 1477 0.3066860 0.6933140
## 30 201508 3528 1644 0.3178654 0.6821346
## 31 201507 4038 1938 0.3242972 0.6757028
## 32 201506 3883 1849 0.3225750 0.6774250
## 33 201505 4435 2087 0.3199939 0.6800061
## 34 201504 4630 2160 0.3181149 0.6818851
## 35 201503 4763 2112 0.3072000 0.6928000
## 36 201502 3379 1416 0.2953076 0.7046924
## 37 201501 2636 1128 0.2996812 0.7003188
## 38 201412 3629 1823 0.3343727 0.6656273
## 39 201411 3720 1906 0.3387842 0.6612158
## 40 201410 3840 1936 0.3351801 0.6648199
## 41 201409 4687 1797 0.2771437 0.7228563
## 42 201408 6450 2207 0.2549382 0.7450618
## 43 201407 2960 1823 0.3811415 0.6188585
## 44 201406 2305 1722 0.4276136 0.5723864
## 45 201405 2288 1865 0.4490730 0.5509270
## 46 201404 2074 1845 0.4707834 0.5292166
## 47 201403 1993 1830 0.4786817 0.5213183
## 48 201402 1519 1260 0.4534005 0.5465995
## 49 201401 1573 1188 0.4302789 0.5697211
## 50 201312 2101 1675 0.4435911 0.5564089
## 51 201311 1900 1639 0.4631252 0.5368748
## 52 201310 1904 1549 0.4485954 0.5514046
## 53 201309 1673 1421 0.4592760 0.5407240
## 54 201308 1847 1693 0.4782486 0.5217514
## 55 201307 2022 1738 0.4622340 0.5377660
## 56 201306 1883 1779 0.4858001 0.5141999
## 57 201305 1918 1874 0.4941983 0.5058017
## 58 201304 1766 1768 0.5002830 0.4997170
## 59 201303 1636 1823 0.5270309 0.4729691
## 60 201302 1425 1221 0.4614512 0.5385488
## 61 201301 1271 1018 0.4447357 0.5552643
## 62 201212 1635 1332 0.4489383 0.5510617
## 63 201211 1605 1385 0.4632107 0.5367893
## 64 201210 1633 1442 0.4689431 0.5310569
## 65 201209 1766 1437 0.4486419 0.5513581
## 66 201208 1872 1651 0.4686347 0.5313653
## 67 201207 1948 1754 0.4737979 0.5262021
## 68 201206 2056 1697 0.4521716 0.5478284
## 69 201205 2064 1769 0.4615184 0.5384816
## 70 201204 1920 1781 0.4812213 0.5187787
## 71 201203 1759 1662 0.4858229 0.5141771
## 72 201202 1259 1110 0.4685521 0.5314479
## 73 201201 1243 1043 0.4562555 0.5437445
## 74 201112 1268 1125 0.4701212 0.5298788
## 75 201111 890 988 0.5260916 0.4739084
## 76 201110 1013 1060 0.5113362 0.4886638
## 77 201109 1086 1052 0.4920486 0.5079514
## 78 201108 1165 1231 0.5137730 0.4862270
## 79 201107 1075 1095 0.5046083 0.4953917
## 80 201106 999 1038 0.5095729 0.4904271
## 81 201105 1002 1119 0.5275813 0.4724187
## 82 201104 815 1044 0.5615923 0.4384077
## 83 201103 748 858 0.5342466 0.4657534
## 84 201102 582 601 0.5080304 0.4919696
## 85 201101 558 557 0.4995516 0.5004484
## 86 201012 557 555 0.4991007 0.5008993
## 87 201011 500 489 0.4944388 0.5055612
## 88 201010 533 462 0.4643216 0.5356784
## 89 201009 539 451 0.4555556 0.5444444
## 90 201008 563 465 0.4523346 0.5476654
## 91 201007 566 435 0.4345654 0.5654346
## 92 201006 353 331 0.4839181 0.5160819
## 93 201005 243 262 0.5188119 0.4811881
## 94 201004 140 203 0.5918367 0.4081633
## 95 201003 116 147 0.5589354 0.4410646
## 96 201002 94 104 0.5252525 0.4747475
## 97 201001 93 104 0.5279188 0.4720812
## 98 200912 147 98 0.4000000 0.6000000
## 99 200911 95 90 0.4864865 0.5135135
## 100 200910 60 65 0.5200000 0.4800000
## 101 200909 38 41 0.5189873 0.4810127
## 102 200908 29 33 0.5322581 0.4677419
## 103 200907 21 38 0.6440678 0.3559322
## 104 200906 12 16 0.5714286 0.4285714
## 105 200905 8 3 0.2727273 0.7272727
success.ts <- ts(state_by_mth[,3], frequency=12,start=c(2009,5))
autoplot(success.ts)

failed.ts <- ts(state_by_mth[,2], frequency=12,start=c(2009,5))
autoplot(failed.ts)

ggsubseriesplot(success.ts)

ggsubseriesplot(failed.ts)

#Analysis of projects based on backers count
prj.Bkt<- Fnl.data %>%
select(state,backers) %>% filter(state=="failed" | state== "successful") %>%
mutate(greater_1000_backers =ifelse(backers >=1000,1,0 ),less_1000_backers=ifelse(backers< 1000,1,0) ) %>%
arrange(desc(state)) %>% group_by(state) %>% summarise_all(funs(sum))
prj.bkrs.Plot<- as.data.frame(((apply(prj.Bkt[,3:4],2,sum))))
names(prj.bkrs.Plot)[1] <- "count"
backers_category<- rownames(prj.bkrs.Plot)
prj.bkrs.Plot <- cbind(backers_category,prj.bkrs.Plot)
rownames(prj.bkrs.Plot) <- NULL
ggplot(prj.bkrs.Plot, aes('', count, fill = backers_category)) +
geom_col(position = 'fill') +
geom_label(aes(label = paste(format(round( (count/sum(count))*100, 2), nsmall = 2),"%")), position = position_fill(vjust = 0.6)) +
coord_polar(theta = 'y')+
labs(x = NULL, y = NULL, fill = "Backers Category", title = "Projects based on backers counts")

#Analysis of backers influence on projects if count is greater than 1000
grt.bkrs.Plot <- prj.Bkt[, c("state","greater_1000_backers")]
ggplot(grt.bkrs.Plot, aes('', greater_1000_backers, fill = state)) +
geom_col(position = 'fill') +
geom_label(aes(label = paste(format(round( (greater_1000_backers/sum(greater_1000_backers))*100, 2), nsmall = 2),"%")), position = position_fill(vjust = 0.6)) +
coord_polar(theta = 'y')+
labs(x = NULL, y = NULL, fill = "Project State Category", title = "Project Success rate| if backers > 1000")

#Analysis of backers influence on projects if count is less than 1000
less.bkrs.Plot <- prj.Bkt[, c("state","less_1000_backers")]
ggplot(less.bkrs.Plot, aes('', less_1000_backers, fill = state)) +
geom_col(position = 'fill') +
geom_label(aes(label = paste(format(round( (less_1000_backers/sum(less_1000_backers))*100, 2), nsmall = 2),"%")), position = position_fill(vjust = 0.6)) +
coord_polar(theta = 'y')+
labs(x = NULL, y = NULL, fill = "Project State Category", title = "Project Success rate| if backers < 1000")

#Analysis of projects based on goal count
goal.prj.Bkt<- Fnl.data %>%
select(state,goal) %>% filter(state=="failed" | state== "successful") %>%
mutate(greater_1000_goal =ifelse(goal >=100,1,0 ) ,less_1000_goal=ifelse(goal< 1000,1,0)) %>%
arrange(desc(state)) %>% group_by(state) %>% summarise_all(funs(sum))
prj.goal.Plot<- as.data.frame(((apply(goal.prj.Bkt[,3:4],2,sum))))
names(prj.goal.Plot)[1] <- "count"
goal_category<- rownames(prj.goal.Plot)
prj.goal.Plot <- cbind(goal_category,prj.goal.Plot)
rownames(prj.goal.Plot) <- NULL
ggplot(prj.goal.Plot, aes('', count, fill = goal_category)) +
geom_col(position = 'fill') +
geom_label(aes(label = paste(format(round( (count/sum(count))*100, 2), nsmall = 2),"%")), position = position_fill(vjust = 0.6)) +
coord_polar(theta = 'y')+
labs(x = NULL, y = NULL, fill = "Goal Category", title = "Projects based on goal counts")

#Analysis of goal influence on projects if count is greater than 1000
grt.goal.Plot <- goal.prj.Bkt[, c("state","greater_1000_goal")]
ggplot(grt.goal.Plot, aes('', greater_1000_goal, fill = state)) +
geom_col(position = 'fill') +
geom_label(aes(label = paste(format(round( (greater_1000_goal/sum(greater_1000_goal))*100, 2), nsmall = 2),"%")), position = position_fill(vjust = 0.6)) +
coord_polar(theta = 'y')+
labs(x = NULL, y = NULL, fill = "Project State Category", title = "Project Success rate| if goal > 1000")

#Analysis of goal influence on projects if count is less than 1000
less.goal.Plot <- goal.prj.Bkt[, c("state","less_1000_goal")]
ggplot(less.goal.Plot, aes('', less_1000_goal, fill = state)) +
geom_col(position = 'fill') +
geom_label(aes(label = paste(format(round( (less_1000_goal/sum(less_1000_goal))*100, 2), nsmall = 2),"%")), position = position_fill(vjust = 0.6)) +
coord_polar(theta = 'y')+
labs(x = NULL, y = NULL, fill = "Project State Category", title = "Project Success rate| if goal < 1000")

#Required data for time series creation
failed_req_var <- Fnl.data %>% select(state,deadline_YrMth,goal,usd_pledged_real,backers) %>% filter(state=="failed") %>% group_by(state,deadline_YrMth) %>% summarise_all(funs(sum) )
failed_req_var <- failed_req_var[,2:5]
names(failed_req_var)[2] <- "failed_goal"
names(failed_req_var)[3] <- "failed_Usd_pledged_real"
names(failed_req_var)[4] <- "failed_backer"
failed_req_var$deadline_YrMth <- as.factor(failed_req_var$deadline_YrMth)
Data.for.ts<- state_by_mth %>% inner_join(failed_req_var, by = c("YrMonth" = "deadline_YrMth"))
successful_req_var <- Fnl.data %>% select(state,deadline_YrMth,goal,usd_pledged_real,backers) %>% filter(state=="successful") %>% group_by(state,deadline_YrMth) %>% summarise_all(funs(sum) )
successful_req_var <- successful_req_var[,2:5]
names(successful_req_var)[2] <- "successful_goal"
names(successful_req_var)[3] <- "successful_Usd_pledged_real"
names(successful_req_var)[4] <- "successful_backer"
successful_req_var$deadline_YrMth <- as.factor(successful_req_var$deadline_YrMth)
Data.for.ts<- Data.for.ts %>% inner_join(successful_req_var, by = c("YrMonth" = "deadline_YrMth"))
#Splitting the data into training and test data
kck.str.ts <- ts(Data.for.ts[,2:11], frequency=12,start=c(2009,5),end=c(2017,12))
kck.str.train.ts<- window(kck.str.ts,end=c(2016,12))
kck.str.test.ts<- window(kck.str.ts,start=c(2017,01),end=c(2017,12))
#Decompose Success State Time Series
Success.decomp <- stl(kck.str.ts[,2],s.window = "periodic")
autoplot(Success.decomp) + ggtitle("Decomposition of Successful Projects over time")

#Number of Successful Project forecast using ETS Method
success.ets.fst <- ets(kck.str.train.ts[,2])
success.ets.Pred <- forecast( success.ets.fst, h=length(kck.str.test.ts[,2]))
round(accuracy(success.ets.Pred,kck.str.test.ts[,2]),3)
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set 3.644 224.055 142.464 -0.016 12.502 0.450 -0.020
## Test set -185.455 284.993 219.313 -14.309 16.108 0.692 0.302
## Theil's U
## Training set NA
## Test set 0.893
#Check the residuals of the ets method
checkresiduals(success.ets.fst)

##
## Ljung-Box test
##
## data: Residuals from ETS(M,A,N)
## Q* = 88.09, df = 14.4, p-value = 1.278e-12
##
## Model df: 4. Total lags used: 18.4
autoplot(kck.str.train.ts[,2],series = "Data" ) +
autolayer(fitted(success.ets.fst), series = "Fitted") +
autolayer(success.ets.Pred, PI = TRUE, series = "ETS") +
xlab("Year") + ylab("No of Projects") +
ggtitle("No of Successful Projects Forecast using ETS Method") +
guides(colour = guide_legend(title = "Model"))

#Number of Successful Project forecast using ARIMA Method
success.arima.fst <- auto.arima(kck.str.train.ts[,2] )
success.arima.Pred <- forecast( success.arima.fst, h=length(kck.str.test.ts[,2]))
#Accuracy
round(accuracy(success.arima.Pred,kck.str.test.ts[,2]),3)
## ME RMSE MAE MPE MAPE MASE ACF1 Theil's U
## Training set -4.538 130.130 87.362 -0.156 6.520 0.276 -0.066 NA
## Test set 68.891 95.157 85.882 4.393 5.371 0.271 0.134 0.407
#Plot the model using Arima
autoplot(success.arima.Pred)+
xlab("Year") + ylab("No of Projects") +
ggtitle("No of Successful Projects Forecast using ARIMA") +
guides(colour = guide_legend(title = "Model"))

#Check the residuals of the ARIMA method
checkresiduals(success.arima.Pred)

##
## Ljung-Box test
##
## data: Residuals from ARIMA(0,1,0)(0,1,1)[12]
## Q* = 25.499, df = 17.4, p-value = 0.09453
##
## Model df: 1. Total lags used: 18.4
prediction=predict(success.arima.fst,n.ahead=12)
#Plot the data vs fitted vs validation set using ets method
autoplot(kck.str.train.ts[,2],series = "Data" ) +
autolayer(fitted(success.arima.Pred), series = "Fitted") +
autolayer(success.arima.Pred, PI = TRUE, series = "ARIMA") +
xlab("Year") + ylab("No of Projects") +
ggtitle("No of Successful Projects Forecast using ARIMA Method") +
guides(colour = guide_legend(title = "Model"))

success.arima.fst %>% forecast(h=12) %>% autoplot()

#Check the residuals of the ets method
checkresiduals(success.arima.fst)

##
## Ljung-Box test
##
## data: Residuals from ARIMA(0,1,0)(0,1,1)[12]
## Q* = 25.499, df = 17.4, p-value = 0.09453
##
## Model df: 1. Total lags used: 18.4
summary(success.arima.fst)
## Series: kck.str.train.ts[, 2]
## ARIMA(0,1,0)(0,1,1)[12]
##
## Coefficients:
## sma1
## -0.2206
## s.e. 0.1066
##
## sigma^2 estimated as 19973: log likelihood=-503.03
## AIC=1010.05 AICc=1010.21 BIC=1014.79
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set -4.537775 130.1305 87.36204 -0.1557508 6.520436 0.275764
## ACF1
## Training set -0.06605202
#Correlation of required variables
kck.str.ts[,c("successful","successful_backer","successful_goal","successful_Usd_pledged_real")] %>%
as.data.frame() %>%
GGally::ggpairs()

#Number of successful projects using dynamic regression
xreg <- cbind(successful_goal = kck.str.train.ts[, "successful_goal"],
successful_backer = kck.str.train.ts[, "successful_backer"],
successful_Usd_pledged_real= kck.str.train.ts[,"successful_Usd_pledged_real"])
success.dym.arima.fst <- auto.arima(kck.str.train.ts[,"successful"],
xreg=xreg,seasonal = TRUE,approximation = FALSE, stepwise = FALSE)
checkresiduals(success.dym.arima.fst)

##
## Ljung-Box test
##
## data: Residuals from Regression with ARIMA(1,0,0)(1,0,0)[12] errors
## Q* = 13.649, df = 12.4, p-value = 0.3534
##
## Model df: 6. Total lags used: 18.4
success.dym.arima.pred <- forecast(success.dym.arima.fst, xreg=
cbind( rep(mean(kck.str.ts[,"successful_goal"]),12),
rep(mean(kck.str.ts[,"successful_backer"]),12),
rep(mean(kck.str.ts[,"successful_Usd_pledged_real"]),12)
) )
## Warning in forecast.Arima(success.dym.arima.fst, xreg =
## cbind(rep(mean(kck.str.ts[, : xreg contains different column names from
## the xreg used in training. Please check that the regressors are in the same
## order.
round(accuracy(success.dym.arima.pred,kck.str.test.ts[,2]),3)
## ME RMSE MAE MPE MAPE MASE ACF1 Theil's U
## Training set 12.385 118.864 84.452 -64.497 72.999 0.267 -0.147 NA
## Test set 392.816 428.058 395.601 23.105 23.384 1.249 0.293 1.604
summary(success.dym.arima.pred)
##
## Forecast method: Regression with ARIMA(1,0,0)(1,0,0)[12] errors
##
## Model Information:
## Series: kck.str.train.ts[, "successful"]
## Regression with ARIMA(1,0,0)(1,0,0)[12] errors
##
## Coefficients:
## Warning in sqrt(diag(x$var.coef)): NaNs produced
## ar1 sar1 intercept successful_goal successful_backer
## 0.9061 0.5852 607.362 0 -2e-04
## s.e. 0.0383 0.0373 NaN NaN 2e-04
## successful_Usd_pledged_real
## 0
## s.e. NaN
##
## sigma^2 estimated as 15114: log likelihood=-573.69
## AIC=1161.38 AICc=1162.71 BIC=1179.03
##
## Error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set 12.38458 118.8638 84.45157 -64.49652 72.9986 0.2665769
## ACF1
## Training set -0.1468526
##
## Forecasts:
## Point Forecast Lo 80 Hi 80 Lo 95 Hi 95
## Jan 2017 1014.709 857.1546 1172.263 773.7503 1255.668
## Feb 2017 1203.315 990.7029 1415.928 878.1527 1528.478
## Mar 2017 1330.834 1081.9629 1579.706 950.2183 1711.451
## Apr 2017 1299.894 1024.8027 1574.985 879.1781 1720.610
## May 2017 1327.804 1032.9233 1622.685 876.8228 1778.785
## Jun 2017 1179.667 869.4814 1489.854 705.2788 1654.056
## Jul 2017 1228.051 905.8417 1550.260 735.2746 1720.827
## Aug 2017 1204.053 872.2980 1535.807 696.6777 1711.428
## Sep 2017 1232.897 893.5055 1572.288 713.8427 1751.951
## Oct 2017 1205.324 859.7888 1550.859 676.8737 1733.774
## Nov 2017 1280.788 930.2891 1631.286 744.7464 1816.829
## Dec 2017 1126.866 772.3446 1481.388 584.6721 1669.061
autoplot(success.dym.arima.pred) + xlab("Year") +
ylab("No of successful projects")

#Plot the data vs fitted vs validation set using dynamic regression method
autoplot(kck.str.train.ts[,2],series = "Data" ) +
autolayer(fitted(success.dym.arima.pred), series = "Fitted") +
autolayer(success.dym.arima.pred, PI = TRUE, series = "Dynamic Regression with ARIMA") +
xlab("Year") + ylab("No of Projects") +
ggtitle("No of Successful Projects Forecast using Dynamic Regresssion Method") +
guides(colour = guide_legend(title = "Model"))

#Forecasting of all the models
autoplot(kck.str.train.ts[,2],series = "Data" ) +
autolayer(success.dym.arima.pred, PI = FALSE, series = "DYNAMIC REGRESSION ARIMA") +
autolayer(success.arima.Pred, PI = FALSE, series = " ARIMA") +
autolayer(success.ets.Pred, PI = FALSE, series = " ETS") +
xlab("Year") + ylab("No of Projects") +
ggtitle("No of Successful Projects Forecast") +
guides(colour = guide_legend(title = "Model"))

#Get the metrics required to select the model
round(accuracy(success.ets.Pred,kck.str.test.ts[,2]),3)[,2]
## Training set Test set
## 224.055 284.993
round(accuracy(success.arima.Pred,kck.str.test.ts[,2]),3)[,2]
## Training set Test set
## 130.130 95.157
round(accuracy(success.dym.arima.pred,kck.str.test.ts[,2]),3)[,2]
## Training set Test set
## 118.864 428.058
success.ets.fst$aic
## [1] 1348.611
success.ets.fst$aicc
## [1] 1349.309
success.arima.fst$aic
## [1] 1010.055
success.arima.fst$aicc
## [1] 1010.212
success.dym.arima.fst$aic
## [1] 1161.379
success.dym.arima.fst$aicc
## [1] 1162.712
summary(success.arima.Pred)[,1]
##
## Forecast method: ARIMA(0,1,0)(0,1,1)[12]
##
## Model Information:
## Series: kck.str.train.ts[, 2]
## ARIMA(0,1,0)(0,1,1)[12]
##
## Coefficients:
## sma1
## -0.2206
## s.e. 0.1066
##
## sigma^2 estimated as 19973: log likelihood=-503.03
## AIC=1010.05 AICc=1010.21 BIC=1014.79
##
## Error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set -4.537775 130.1305 87.36204 -0.1557508 6.520436 0.275764
## ACF1
## Training set -0.06605202
##
## Forecasts:
## Point Forecast Lo 80 Hi 80 Lo 95 Hi 95
## Jan 2017 964.3314 783.2134 1145.449 687.3353 1241.327
## Feb 2017 1233.5371 977.3975 1489.677 841.8054 1625.269
## Mar 2017 1789.3924 1475.6868 2103.098 1309.6211 2269.164
## Apr 2017 1746.6127 1384.3767 2108.849 1192.6205 2300.605
## May 2017 1842.5055 1437.5134 2247.498 1223.1235 2461.888
## Jun 2017 1605.0312 1161.3845 2048.678 926.5322 2283.530
## Jul 2017 1643.4428 1164.2496 2122.636 910.5801 2376.306
## Aug 2017 1444.0499 931.7709 1956.329 660.5867 2227.513
## Sep 2017 1357.2985 813.9445 1900.653 526.3103 2188.287
## Oct 2017 1555.1727 982.4273 2127.918 679.2342 2431.111
## Nov 2017 1684.3969 1083.6965 2285.097 765.7048 2603.089
## Dec 2017 1655.5415 1028.1303 2282.953 695.9989 2615.084
## [1] 964.3314 1233.5371 1789.3924 1746.6127 1842.5055 1605.0312 1643.4428
## [8] 1444.0499 1357.2985 1555.1727 1684.3969 1655.5415
autoplot(success.arima.Pred$mean)

plot.fst.final<- as.data.frame(as.matrix(success.arima.Pred$mean) , stringsAsFactors = FALSE)
names(plot.fst.final)[1] <- "Forecast"
row.names(plot.fst.final)
## [1] "1" "2" "3" "4" "5" "6" "7" "8" "9" "10" "11" "12"
rownames<- c("Jan","Feb","Mar","Apr","May","Jun","Jul","Aug","Sep","Oct","Nov","Dec")
plot.fst <- cbind(rownames,plot.fst.final)
plot.fst$rownames <- factor(plot.fst$rownames,levels=c("Jan","Feb","Mar","Apr","May","Jun","Jul","Aug","Sep","Oct","Nov","Dec"))
str(plot.fst)
## 'data.frame': 12 obs. of 2 variables:
## $ rownames: Factor w/ 12 levels "Jan","Feb","Mar",..: 1 2 3 4 5 6 7 8 9 10 ...
## $ Forecast: num 964 1234 1789 1747 1843 ...
ggplot(data=plot.fst, aes(x=rownames,y=Forecast))+
geom_point()+
xlab("Month") + ylab("No of Projects") +
ggtitle("Estimation of Successful Projects for next year")
