# ---- Setup: working directory, input data, packages ----
# NOTE(review): this absolute, user-specific path makes the script
# non-portable. It is kept because the relative readRDS() path below (and
# possibly later code) resolves against it; prefer launching R from the
# project directory so this call can be dropped.
setwd("C:/Users/Lelan/Documents/Education/CUNY/DATA621/DATA621 - Data Mining/WeekFinal")
# Load the prepared Kickstarter data set from its serialized RDS file.
kick_final <- readRDS(file = "the_final.rds")
# Install kableExtra only when it is missing, so that re-running the script
# does not hit the network or reinstall an already-available package.
if (!requireNamespace("kableExtra", quietly = TRUE)) {
  install.packages("kableExtra", repos = "https://mirrors.nics.utk.edu/cran/")
}
## Installing package into 'C:/Users/Lelan/Documents/R/win-library/3.5'
## (as 'lib' is unspecified)
## package 'kableExtra' successfully unpacked and MD5 sums checked
## 
## The downloaded binary packages are in
##  C:\Users\Lelan\AppData\Local\Temp\RtmpAfhjFb\downloaded_packages
library(kableExtra)
# Same guard for DataExplorer (used below for introduce() and the plot_*()
# helpers).
if (!requireNamespace("DataExplorer", quietly = TRUE)) {
  install.packages("DataExplorer", repos = "https://mirrors.nics.utk.edu/cran/")
}
## Installing package into 'C:/Users/Lelan/Documents/R/win-library/3.5'
## (as 'lib' is unspecified)
## package 'DataExplorer' successfully unpacked and MD5 sums checked
## 
## The downloaded binary packages are in
##  C:\Users\Lelan\AppData\Local\Temp\RtmpAfhjFb\downloaded_packages
library(DataExplorer)
library(data.table)
# ---- Data dictionary ----
# Three parallel vectors, one entry per dictionary row:
#   rc   - raw (source) column the feature was derived from
#   fc   - column name in the final modelling data set
#   desc - human-readable definition shown in the report
rc <- c(
  "id",
  "state",
  "blurb",
  "category.slug",
  "category.slug",
  "country",
  "disable_communication",
  "fx_rate",
  "goal",
  "location.type",
  "static_usd_rate",
  "profile.link_url",
  "creator.name",
  "currency_trailing_code",
  "creator.urls.web.user",
  "current_currency",
  "creator.slug",
  "launched_at and created_at",
  "launched_at",
  "created_at",
  "created_at",
  "created_at",
  "created_at",
  "deadline",
  "deadline",
  "deadline",
  "blurb"
)
fc <- c(
  "id",
  "state",
  "blurb",
  "parent_cat",
  "sub_cat",
  "country",
  "disable_communication",
  "fx_rate",
  "goal",
  "location.type",
  "static_usd_rate",
  "profile.link_url_dummy",
  "creator_name_nchar",
  "currency_trailing_code",
  "binary_web_slug",
  "cur_cur_binary",
  "prior_slug_count",
  "launch_delay",
  "launch_month",
  "create_month",
  "year",
  "month",
  "day",
  "year_deadline",
  "month_deadline",
  "day1_deadline",
  "blurb_1 - blurb_100"
)
desc <- c(
  "Unique ID",
  'The state of the project: "failed" or "successful"',
  "Long-form text description of project",
  'The parent category for the project: "music", "journalism", "art", "theater", "technology", "film & video", "publishing", "design", "games", "photography", "comics", "food", "fashion", "crafts" or "dance"',
  "Sub-category for the project",
  "2-digit country code (22 different countries)",
  "Whether communication for the project was disabled (0 = no, 1 = yes)",
  "Foreign exchange rate associated with the funding for the project",
  "The funding goal for the project",
  'A dummy variable based on the location.type. If location.type is "Town", then this is set to 0, otherwise it is set to 1.',
  "The fixed US dollar exchange rate for the first transaction",
  "If the profile for the project has a linked url, this is 1, otherwise it is 0.",
  "The number of characters in the name of the project creator",
  # Fixed typo: was "When or not ...".
  'Whether or not the currency has a trailing code. If "True", then 1, if "False", then 0.',
  "If the creator slug from the creator.urls.web.user is numeric, this is 1, if it is a word it is 0",
  'If the current_currency value is "CAD", this is 0, if it is "USD", it is 1.',
  # Completed a sentence that was cut off after "submitted by a ".
  # NOTE(review): "creator" is inferred from the raw column creator.slug --
  # confirm the intended wording.
  "A cumulative indicator for number of projects submitted by a creator",
  "The difference between launched_at and created_at.",
  'The month extracted from the "launched_at" date',
  'The month extracted from the "created_at" date',
  'Year from "created_at" field',
  'Month from "created_at" field',
  'Day from "created_at" field',
  'Year from "deadline" field',
  'Month from "deadline" field',
  'Day from "deadline" field',
  "The top 100 terms from the singular value decomposition of the term frequency–inverse document frequency matrix generated from the blurb text"
)
# cbind() recycles shorter vectors, so a dropped entry could silently shift
# definitions onto the wrong columns; fail loudly instead.
stopifnot(length(rc) == length(fc), length(fc) == length(desc))
# Character matrix with one row per dictionary entry.
dd <- cbind(rc, fc, desc)
# Render the dictionary as a styled table (striped/condensed for HTML,
# scaled down for LaTeX output).
kable(
  dd,
  col.names = c("Raw Column", "Final Column", "Definition"),
  caption = "Data Dictionary"
) %>%
  kable_styling(
    bootstrap_options = c("striped", "condensed"),
    full_width = TRUE,
    font_size = 9,
    latex_options = "scale_down"
  )
Data Dictionary
Raw Column Final Column Definition
id id Unique ID
state state The state of the project: “failed” or “successful”
blurb blurb Long-form text description of project
category.slug parent_cat The parent category for the project: “music”, “journalism”, “art”, “theater”, “technology”, “film & video”, “publishing”, “design”, “games”, “photography”, “comics”, “food”, “fashion”, “crafts” or “dance”
category.slug sub_cat Sub-category for the project
country country 2-digit country code (22 different countries)
disable_communication disable_communication Whether communication for the project was disabled (0 = no, 1 = yes)
fx_rate fx_rate Foreign exchange rate associated with the funding for the project
goal goal The funding goal for the project
location.type location.type A dummy variable based on the location.type. If location.type is “Town”, then this is set to 0, otherwise it is set to 1.
static_usd_rate static_usd_rate The fixed US dollar exchange rate for the first transaction
profile.link_url profile.link_url_dummy If the profile for the project has a linked url, this is 1, otherwise it is 0.
creator.name creator_name_nchar The number of characters in the name of the project creator
currency_trailing_code currency_trailing_code When or not the currency has a trailing code. If “True”, then 1, if “False”, then 0.
creator.urls.web.user binary_web_slug If the creator slug from the creator.urls.web.user is numeric, this is 1, if it is a word it is 0
current_currency cur_cur_binary If the current_currency value is “CAD”, this is 0, if it is “USD”, it is 1.
creator.slug prior_slug_count A cumulative indicator for number of projects submitted by a
launched_at and created_at launch_delay The difference between launched_at and created_at.
launched_at launch_month The month extracted from the “launched_at” date
created_at create_month The month extracted from the “created_at” date
created_at year Year from “created_at” field
created_at month Month from “created_at” field
created_at day Day from “created_at” field
deadline year_deadline Year from “deadline” field
deadline month_deadline Month from “deadline” field
deadline day1_deadline Day from “deadline” field
blurb blurb_1 - blurb_100 The top 100 terms from the singular value decomposition of the term frequency–inverse document frequency matrix generated from the blurb text


EDA:

# ---- Summary counts for the data set ----
# Compute the one-row DataExplorer summary once and reuse it. The original
# code called introduce() a second time on the already-transposed table just
# to recover the metric names, which only worked because introduce() returns
# the same column names for any input.
kick_intro <- introduce(kick_final)
# Flip the one-row summary into one row per metric; the namespace is spelled
# out because other packages also export a transpose().
i_kick <- data.table::transpose(kick_intro)
# Label each row with the metric it holds and name the single value column.
rownames(i_kick) <- colnames(kick_intro)
colnames(i_kick)[1] <- "Counts"
knitr::kable(i_kick, caption = "Counts for Kickstarter Data Set")
Counts for Kickstarter Data Set
Counts
rows 173879
columns 126
discrete_columns 13
continuous_columns 113
all_missing_columns 0
total_missing_values 1
complete_rows 173878
total_observations 21908754
memory_usage 194511328


# ---- Exploratory plots (DataExplorer) ----
# Columns are selected by position throughout, so these calls depend on the
# column order of kick_final.
# NOTE(review): the index vectors are deliberately left inline -- if
# kick_final is a data.table, a variable placed in `j` would not be treated
# as a column selection. Confirm the class before refactoring.

# Overview plot for columns 2 and 4-17.
plot_intro(kick_final[,c(2,4:17)])

# Bar charts for a subset of those columns (8, 9 and 13 are left out).
# The recorded output below shows sub_cat being skipped for having more than
# 50 categories.
plot_bar(kick_final[,c(2,4:7,10:12,14:17)])
## 1 columns ignored with more than 50 categories.
## sub_cat: 159 categories

# Box plots of columns 2 and 4-17, grouped by the "state" column.
plot_boxplot(kick_final[,c(2,4:17)], by ="state")

# Density plots for columns 9, 13 and 18 only.
# presumably goal, creator_name_nchar and launch_delay, going by the data
# dictionary order -- TODO confirm against names(kick_final).
plot_density(kick_final[,c(9,13,18)])

# Scatter plots of columns 2 and 4-17 against the "state" column.
plot_scatterplot(kick_final[,c(2,4:17)], by = "state")