# Switch the session to the project folder so the relative path below resolves.
setwd(
  "C:/Users/Lelan/Documents/Education/CUNY/DATA621/DATA621 - Data Mining/WeekFinal"
)

# Load the data set stored in the_final.rds from the working directory.
kick_final <- readRDS("the_final.rds")
# Install kableExtra only when it is not already available, so re-running the
# script does not re-download the package on every run (or fail when offline).
if (!requireNamespace("kableExtra", quietly = TRUE)) {
  install.packages("kableExtra", repos = "https://mirrors.nics.utk.edu/cran/")
}
## Installing package into 'C:/Users/Lelan/Documents/R/win-library/3.5'
## (as 'lib' is unspecified)
## package 'kableExtra' successfully unpacked and MD5 sums checked
##
## The downloaded binary packages are in
## C:\Users\Lelan\AppData\Local\Temp\RtmpAfhjFb\downloaded_packages
library(kableExtra)
# Install DataExplorer only when it is not already available, so re-running the
# script does not re-download the package on every run (or fail when offline).
if (!requireNamespace("DataExplorer", quietly = TRUE)) {
  install.packages("DataExplorer", repos = "https://mirrors.nics.utk.edu/cran/")
}
## Installing package into 'C:/Users/Lelan/Documents/R/win-library/3.5'
## (as 'lib' is unspecified)
## package 'DataExplorer' successfully unpacked and MD5 sums checked
##
## The downloaded binary packages are in
## C:\Users\Lelan\AppData\Local\Temp\RtmpAfhjFb\downloaded_packages
library(DataExplorer)
library(data.table)
# Data dictionary: three parallel vectors, one entry per documented column.
#   rc   - name of the raw source column
#   fc   - name of the column in the final data set derived from it
#   desc - human-readable definition shown in the rendered table
# Entry i of each vector describes the same variable, so the vectors must stay
# the same length and in the same order.
rc <- c(
  "id",
  "state",
  "blurb",
  "category.slug",
  "category.slug",
  "country",
  "disable_communication",
  "fx_rate",
  "goal",
  "location.type",
  "static_usd_rate",
  "profile.link_url",
  "creator.name",
  "currency_trailing_code",
  "creator.urls.web.user",
  "current_currency",
  "creator.slug",
  "launched_at and created_at",
  "launched_at",
  "created_at",
  "created_at",
  "created_at",
  "created_at",
  "deadline",
  "deadline",
  "deadline",
  "blurb"
)

fc <- c(
  "id",
  "state",
  "blurb",
  "parent_cat",
  "sub_cat",
  "country",
  "disable_communication",
  "fx_rate",
  "goal",
  "location.type",
  "static_usd_rate",
  "profile.link_url_dummy",
  "creator_name_nchar",
  "currency_trailing_code",
  "binary_web_slug",
  "cur_cur_binary",
  "prior_slug_count",
  "launch_delay",
  "launch_month",
  "create_month",
  "year",
  "month",
  "day",
  "year_deadline",
  "month_deadline",
  "day1_deadline",
  "blurb_1 - blurb_100"
)

# Single quotes are used where the text itself contains double quotes.
desc <- c(
  'Unique ID ',
  'The state of the project: "failed" or "successful"',
  'Long-form text description of project',
  'The parent category for the project: "music", "journalism", "art", "theater", "technology", "film & video", "publishing", "design", "games", "photography", "comics", "food", "fashion", "crafts" or "dance"',
  'Sub-category for the project',
  '2-digit country code (22 different countries)',
  'Whether communication for the project was disabled (0 = no, 1 = yes)',
  'Foreign exchange rate associated with the funding for the project',
  'The funding goal for the project',
  'A dummy variable based on the location.type. If location.type is "Town", then this is set to 0, otherwise it is set to 1.',
  'The fixed US dollar exchange rate for the first transaction',
  'If the profile for the project has a linked url, this is 1, otherwise it is 0.',
  'The number of characters in the name of the project creator',
  # Fixed typo: previously read "When or not".
  'Whether or not the currency has a trailing code. If "True", then 1, if "False", then 0.',
  'If the creator slug from the creator.urls.web.user is numeric, this is 1, if it is a word it is 0',
  'If the current_currency value is "CAD", this is 0, if it is "USD", it is 1.',
  # Completed a sentence that was cut off after "submitted by a".
  # NOTE(review): "creator" is inferred from the raw column creator.slug - confirm.
  'A cumulative indicator for the number of projects submitted by a creator',
  'The difference between launched_at and created_at.',
  'The month extracted from the "launched_at" date',
  'The month extracted from the "created_at" date',
  'Year from "created_at" field',
  'Month from "created_at" field',
  'Day from "created_at" field',
  'Year from "deadline" field',
  'Month from "deadline" field',
  'Day from "deadline" field',
  'The top 100 terms from the singular value decomposition of the term frequency–inverse document frequency matrix generated from the blurb text'
)

# cbind() silently recycles shorter vectors, which would misalign the table;
# fail loudly instead if the three vectors ever get out of sync.
stopifnot(length(rc) == length(fc), length(fc) == length(desc))

# Character matrix with one row per variable and columns rc / fc / desc.
dd <- cbind(rc, fc, desc)

# Render the dictionary as a styled table.
kable(
  dd,
  col.names = c("Raw Column", "Final Column", "Definition"),
  caption = "Data Dictionary"
) %>%
  kable_styling(
    bootstrap_options = c("striped", "condensed"),
    full_width = TRUE,
    font_size = 9,
    latex_options = "scale_down"
  )
Data Dictionary
|
Raw Column
|
Final Column
|
Definition
|
|
id
|
id
|
Unique ID
|
|
state
|
state
|
The state of the project: “failed” or “successful”
|
|
blurb
|
blurb
|
Long-form text description of project
|
|
category.slug
|
parent_cat
|
The parent category for the project: “music”, “journalism”, “art”, “theater”, “technology”, “film & video”, “publishing”, “design”, “games”, “photography”, “comics”, “food”, “fashion”, “crafts” or “dance”
|
|
category.slug
|
sub_cat
|
Sub-category for the project
|
|
country
|
country
|
2-digit country code (22 different countries)
|
|
disable_communication
|
disable_communication
|
Whether communication for the project was disabled (0 = no, 1 = yes)
|
|
fx_rate
|
fx_rate
|
Foreign exchange rate associated with the funding for the project
|
|
goal
|
goal
|
The funding goal for the project
|
|
location.type
|
location.type
|
A dummy variable based on the location.type. If location.type is “Town”, then this is set to 0, otherwise it is set to 1.
|
|
static_usd_rate
|
static_usd_rate
|
The fixed US dollar exchange rate for the first transaction
|
|
profile.link_url
|
profile.link_url_dummy
|
If the profile for the project has a linked url, this is 1, otherwise it is 0.
|
|
creator.name
|
creator_name_nchar
|
The number of characters in the name of the project creator
|
|
currency_trailing_code
|
currency_trailing_code
|
Whether or not the currency has a trailing code. If “True”, then 1, if “False”, then 0.
|
|
creator.urls.web.user
|
binary_web_slug
|
If the creator slug from the creator.urls.web.user is numeric, this is 1, if it is a word it is 0
|
|
current_currency
|
cur_cur_binary
|
If the current_currency value is “CAD”, this is 0, if it is “USD”, it is 1.
|
|
creator.slug
|
prior_slug_count
|
A cumulative indicator for the number of projects submitted by a creator
|
|
launched_at and created_at
|
launch_delay
|
The difference between launched_at and created_at.
|
|
launched_at
|
launch_month
|
The month extracted from the “launched_at” date
|
|
created_at
|
create_month
|
The month extracted from the “created_at” date
|
|
created_at
|
year
|
Year from “created_at” field
|
|
created_at
|
month
|
Month from “created_at” field
|
|
created_at
|
day
|
Day from “created_at” field
|
|
deadline
|
year_deadline
|
Year from “deadline” field
|
|
deadline
|
month_deadline
|
Month from “deadline” field
|
|
deadline
|
day1_deadline
|
Day from “deadline” field
|
|
blurb
|
blurb_1 - blurb_100
|
The top 100 terms from the singular value decomposition of the term frequency–inverse document frequency matrix generated from the blurb text
|
EDA:
# Summarise the data set once with introduce() and reuse that result for both
# the values and the row labels. The labels were previously taken from
# introduce() applied to the transposed summary itself, which only lined up
# because introduce() reports the same set of metric names for any input, and
# it meant computing the summary twice.
kick_intro <- introduce(kick_final)

# Turn the one-row summary into one row per metric.
i_kick <- transpose(kick_intro)
rownames(i_kick) <- colnames(kick_intro)
colnames(i_kick)[1] <- "Counts"

knitr::kable(i_kick, caption = "Counts for Kickstarter Data Set")
Counts for Kickstarter Data Set
|
|
Counts
|
|
rows
|
173879
|
|
columns
|
126
|
|
discrete_columns
|
13
|
|
continuous_columns
|
113
|
|
all_missing_columns
|
0
|
|
total_missing_values
|
1
|
|
complete_rows
|
173878
|
|
total_observations
|
21908754
|
|
memory_usage
|
194511328
|
# Exploratory plots. Columns are selected by position in kick_final; the index
# vectors are kept inline so the subsetting behaves the same for any table class.

# Overview of the selected columns (positions 2 and 4-17).
plot_intro(data = kick_final[, c(2, 4:17)])

# Bar charts for the selected columns.
plot_bar(data = kick_final[, c(2, 4:7, 10:12, 14:17)])
## 1 columns ignored with more than 50 categories.
## sub_cat: 159 categories

# Box plots of the selected columns, split by the "state" column.
plot_boxplot(
  data = kick_final[, c(2, 4:17)],
  by = "state"
)

# Density plots for columns 9, 13 and 18.
plot_density(data = kick_final[, c(9, 13, 18)])

# Scatter plots of the selected columns against the "state" column.
plot_scatterplot(
  data = kick_final[, c(2, 4:17)],
  by = "state"
)

