Data Importing and Cleaning/Tidying
Importing The Data (CPI Data and VC Investment Data from 1913 to 2014)
library(anytime)
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.2 ✓ purrr 0.3.3
## ✓ tibble 2.1.3 ✓ dplyr 0.8.5
## ✓ tidyr 1.0.2 ✓ stringr 1.4.0
## ✓ readr 1.3.1 ✓ forcats 0.5.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library(lubridate)
##
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
##
## date
library(ggplot2)
library(stringr)
setwd("~/Desktop/R Datasets/Data Science Datasets")
vc_data <- read.csv("investments_VC.csv",na.strings = "",stringsAsFactors = FALSE)
library("jsonlite")
##
## Attaching package: 'jsonlite'
## The following object is masked from 'package:purrr':
##
## flatten
json_file <- 'https://datahub.io/core/cpi-us/datapackage.json'
json_data <- fromJSON(paste(readLines(json_file), collapse=""))
## Warning in readLines(json_file): incomplete final line found on 'https://
## datahub.io/core/cpi-us/datapackage.json'
# get list of all resources:
print(json_data$resources$name)
## [1] "validation_report" "cpiai_csv" "cpiai_json"
## [4] "cpi-us_zip" "cpiai"
# print all tabular data(if exists any)
for(i in 1:length(json_data$resources$datahub$type)){
if(json_data$resources$datahub$type[i]=='derived/csv'){
path_to_file = json_data$resources$path[i]
data <- read.csv(url(path_to_file))
}
}
Structure, head, and tail of Data
str(vc_data)
## 'data.frame': 49438 obs. of 34 variables:
## $ name : chr "#waywire" "&TV Communications" "'Rock' Your Paper" "(In)Touch Network" ...
## $ category_list : chr "|Entertainment|Politics|Social Media|News|" "|Games|" "|Publishing|Education|" "|Electronics|Guides|Coffee|Restaurants|Music|iPhone|Apps|Mobile|iOS|E-Commerce|" ...
## $ market : chr " News " " Games " " Publishing " " Electronics " ...
## $ funding_total_usd : chr " 17,50,000 " " 40,00,000 " "40,000" " 15,00,000 " ...
## $ status : chr "acquired" "operating" "operating" "operating" ...
## $ country_code : chr "USA" "USA" "EST" "GBR" ...
## $ state_code : chr "NY" "CA" NA NA ...
## $ region : chr "New York City" "Los Angeles" "Tallinn" "London" ...
## $ city : chr "New York" "Los Angeles" "Tallinn" "London" ...
## $ funding_rounds : int 1 2 1 1 2 1 1 1 1 1 ...
## $ founded_at : chr "6/1/12" NA "10/26/12" "4/1/11" ...
## $ founded_month : chr "2012-06" NA "2012-10" "2011-04" ...
## $ founded_quarter : chr "2012-Q2" NA "2012-Q4" "2011-Q2" ...
## $ founded_year : int 2012 NA 2012 2011 2014 2011 NA 2007 2010 NA ...
## $ first_funding_at : chr "6/30/12" "6/4/10" "8/9/12" "4/1/11" ...
## $ last_funding_at : chr "6/30/12" "9/23/10" "8/9/12" "4/1/11" ...
## $ seed : int 1750000 0 40000 1500000 0 0 0 0 0 41250 ...
## $ venture : num 0e+00 4e+06 0e+00 0e+00 0e+00 7e+06 0e+00 2e+06 0e+00 0e+00 ...
## $ equity_crowdfunding : int 0 0 0 0 60000 0 0 0 0 0 ...
## $ undisclosed : int 0 0 0 0 0 0 4912393 0 0 0 ...
## $ convertible_note : int 0 0 0 0 0 0 0 0 0 0 ...
## $ debt_financing : num 0 0 0 0 0 0 0 0 0 0 ...
## $ angel : int 0 0 0 0 0 0 0 0 0 0 ...
## $ grant : int 0 0 0 0 0 0 0 0 0 0 ...
## $ private_equity : num 0 0 0 0 0 0 0 0 0 0 ...
## $ post_ipo_equity : num 0 0 0 0 0 0 0 0 0 0 ...
## $ post_ipo_debt : num 0 0 0 0 0 0 0 0 0 0 ...
## $ secondary_market : int 0 0 0 0 0 0 0 0 0 0 ...
## $ product_crowdfunding: int 0 0 0 0 0 0 0 0 0 0 ...
## $ round_A : int 0 0 0 0 0 0 0 2000000 0 0 ...
## $ round_B : int 0 0 0 0 0 7000000 0 0 0 0 ...
## $ round_C : int 0 0 0 0 0 0 0 0 0 0 ...
## $ round_D : int 0 0 0 0 0 0 0 0 0 0 ...
## $ round_E : int 0 0 0 0 0 0 0 0 0 0 ...
head(vc_data)
## name
## 1 #waywire
## 2 &TV Communications
## 3 'Rock' Your Paper
## 4 (In)Touch Network
## 5 -R- Ranch and Mine
## 6 .Club Domains
## category_list
## 1 |Entertainment|Politics|Social Media|News|
## 2 |Games|
## 3 |Publishing|Education|
## 4 |Electronics|Guides|Coffee|Restaurants|Music|iPhone|Apps|Mobile|iOS|E-Commerce|
## 5 |Tourism|Entertainment|Games|
## 6 |Software|
## market funding_total_usd status country_code state_code
## 1 News 17,50,000 acquired USA NY
## 2 Games 40,00,000 operating USA CA
## 3 Publishing 40,000 operating EST <NA>
## 4 Electronics 15,00,000 operating GBR <NA>
## 5 Tourism 60,000 operating USA TX
## 6 Software 70,00,000 <NA> USA FL
## region city funding_rounds founded_at founded_month
## 1 New York City New York 1 6/1/12 2012-06
## 2 Los Angeles Los Angeles 2 <NA> <NA>
## 3 Tallinn Tallinn 1 10/26/12 2012-10
## 4 London London 1 4/1/11 2011-04
## 5 Dallas Fort Worth 2 1/1/14 2014-01
## 6 Ft. Lauderdale Oakland Park 1 10/10/11 2011-10
## founded_quarter founded_year first_funding_at last_funding_at seed venture
## 1 2012-Q2 2012 6/30/12 6/30/12 1750000 0e+00
## 2 <NA> NA 6/4/10 9/23/10 0 4e+06
## 3 2012-Q4 2012 8/9/12 8/9/12 40000 0e+00
## 4 2011-Q2 2011 4/1/11 4/1/11 1500000 0e+00
## 5 2014-Q1 2014 8/17/14 9/26/14 0 0e+00
## 6 2011-Q4 2011 5/31/13 5/31/13 0 7e+06
## equity_crowdfunding undisclosed convertible_note debt_financing angel grant
## 1 0 0 0 0 0 0
## 2 0 0 0 0 0 0
## 3 0 0 0 0 0 0
## 4 0 0 0 0 0 0
## 5 60000 0 0 0 0 0
## 6 0 0 0 0 0 0
## private_equity post_ipo_equity post_ipo_debt secondary_market
## 1 0 0 0 0
## 2 0 0 0 0
## 3 0 0 0 0
## 4 0 0 0 0
## 5 0 0 0 0
## 6 0 0 0 0
## product_crowdfunding round_A round_B round_C round_D round_E
## 1 0 0 0 0 0 0
## 2 0 0 0 0 0 0
## 3 0 0 0 0 0 0
## 4 0 0 0 0 0 0
## 5 0 0 0 0 0 0
## 6 0 0 7000000 0 0 0
tail(vc_data)
## name
## 49433 Zytoprotec
## 49434 Zzish
## 49435 ZZNode Science and Technology
## 49436 Zzzzapp Wireless ltd.
## 49437 [a]list games
## 49438 [x+1]
## category_list
## 49433 |Biotechnology|
## 49434 |Analytics|Gamification|Developer APIs|iOS|Android|Education|
## 49435 |Enterprise Software|
## 49436 |Web Development|Advertising|Wireless|Mobile|
## 49437 |Games|
## 49438 |Enterprise Software|
## market funding_total_usd status country_code state_code
## 49433 Biotechnology 26,86,600 operating AUT <NA>
## 49434 Education 3,20,000 operating GBR <NA>
## 49435 Enterprise Software 15,87,301 operating CHN <NA>
## 49436 Web Development 97,398 operating HRV <NA>
## 49437 Games 93,00,000 operating <NA> <NA>
## 49438 Enterprise Software 4,50,00,000 operating USA NY
## region city funding_rounds founded_at founded_month
## 49433 Vienna Gerasdorf Bei Wien 1 1/1/07 2007-01
## 49434 London London 1 1/28/13 2013-01
## 49435 Beijing Beijing 1 <NA> <NA>
## 49436 Split Split 5 5/13/12 2012-05
## 49437 <NA> <NA> 1 <NA> <NA>
## 49438 New York City New York 4 1/1/99 1999-01
## founded_quarter founded_year first_funding_at last_funding_at seed
## 49433 2007-Q1 2007 1/29/13 1/29/13 0
## 49434 2013-Q1 2013 3/24/14 3/24/14 320000
## 49435 <NA> NA 4/1/12 4/1/12 0
## 49436 2012-Q2 2012 11/1/11 9/10/14 71525
## 49437 <NA> NA 11/21/11 11/21/11 9300000
## 49438 1999-Q1 1999 6/1/08 4/4/13 0
## venture equity_crowdfunding undisclosed convertible_note debt_financing
## 49433 2686600 0 0 0 0.0e+00
## 49434 0 0 0 0 0.0e+00
## 49435 1587301 0 0 0 0.0e+00
## 49436 0 0 0 25873 0.0e+00
## 49437 0 0 0 0 0.0e+00
## 49438 28000000 0 0 0 1.7e+07
## angel grant private_equity post_ipo_equity post_ipo_debt secondary_market
## 49433 0 0 0 0 0 0
## 49434 0 0 0 0 0 0
## 49435 0 0 0 0 0 0
## 49436 0 0 0 0 0 0
## 49437 0 0 0 0 0 0
## 49438 0 0 0 0 0 0
## product_crowdfunding round_A round_B round_C round_D round_E
## 49433 0 2686600 0 0 0 0
## 49434 0 0 0 0 0 0
## 49435 0 1587301 0 0 0 0
## 49436 0 0 0 0 0 0
## 49437 0 0 0 0 0 0
## 49438 0 16000000 10000000 0 0 0
Mutate character variables into factors and dates
vc_data2 <- vc_data %>% mutate(category_list = as.factor(category_list), market = as.factor(market), status = as.factor(status), country_code = as.factor(country_code), state_code = as.factor(state_code), city = as.factor(city), founded_at = mdy(founded_at), first_funding_at = mdy(first_funding_at), last_funding_at = mdy(last_funding_at))
## Warning: 68 failed to parse.
## Warning: 10 failed to parse.
## Warning: 6 failed to parse.
head(vc_data2)
## name
## 1 #waywire
## 2 &TV Communications
## 3 'Rock' Your Paper
## 4 (In)Touch Network
## 5 -R- Ranch and Mine
## 6 .Club Domains
## category_list
## 1 |Entertainment|Politics|Social Media|News|
## 2 |Games|
## 3 |Publishing|Education|
## 4 |Electronics|Guides|Coffee|Restaurants|Music|iPhone|Apps|Mobile|iOS|E-Commerce|
## 5 |Tourism|Entertainment|Games|
## 6 |Software|
## market funding_total_usd status country_code state_code
## 1 News 17,50,000 acquired USA NY
## 2 Games 40,00,000 operating USA CA
## 3 Publishing 40,000 operating EST <NA>
## 4 Electronics 15,00,000 operating GBR <NA>
## 5 Tourism 60,000 operating USA TX
## 6 Software 70,00,000 <NA> USA FL
## region city funding_rounds founded_at founded_month
## 1 New York City New York 1 2012-06-01 2012-06
## 2 Los Angeles Los Angeles 2 <NA> <NA>
## 3 Tallinn Tallinn 1 2012-10-26 2012-10
## 4 London London 1 2011-04-01 2011-04
## 5 Dallas Fort Worth 2 2014-01-01 2014-01
## 6 Ft. Lauderdale Oakland Park 1 2011-10-10 2011-10
## founded_quarter founded_year first_funding_at last_funding_at seed venture
## 1 2012-Q2 2012 2012-06-30 2012-06-30 1750000 0e+00
## 2 <NA> NA 2010-06-04 2010-09-23 0 4e+06
## 3 2012-Q4 2012 2012-08-09 2012-08-09 40000 0e+00
## 4 2011-Q2 2011 2011-04-01 2011-04-01 1500000 0e+00
## 5 2014-Q1 2014 2014-08-17 2014-09-26 0 0e+00
## 6 2011-Q4 2011 2013-05-31 2013-05-31 0 7e+06
## equity_crowdfunding undisclosed convertible_note debt_financing angel grant
## 1 0 0 0 0 0 0
## 2 0 0 0 0 0 0
## 3 0 0 0 0 0 0
## 4 0 0 0 0 0 0
## 5 60000 0 0 0 0 0
## 6 0 0 0 0 0 0
## private_equity post_ipo_equity post_ipo_debt secondary_market
## 1 0 0 0 0
## 2 0 0 0 0
## 3 0 0 0 0
## 4 0 0 0 0
## 5 0 0 0 0
## 6 0 0 0 0
## product_crowdfunding round_A round_B round_C round_D round_E
## 1 0 0 0 0 0 0
## 2 0 0 0 0 0 0
## 3 0 0 0 0 0 0
## 4 0 0 0 0 0 0
## 5 0 0 0 0 0 0
## 6 0 0 7000000 0 0 0
Identify date issues
# Identify dates where first funding occurs before company founded
vc_data2 %>% count(founded_at == first_funding_at)
## # A tibble: 3 x 2
## `founded_at == first_funding_at` n
## <lgl> <int>
## 1 FALSE 36133
## 2 TRUE 2347
## 3 NA 10958
vc_data2 %>% count(founded_at > first_funding_at)
## # A tibble: 3 x 2
## `founded_at > first_funding_at` n
## <lgl> <int>
## 1 FALSE 35576
## 2 TRUE 2904
## 3 NA 10958
fund_date_issue <- vector(mode = "logical", length = nrow(vc_data2))
for(i in 1:nrow(vc_data2)){
if(difftime(vc_data2[i,"founded_at"],vc_data2[i,"first_funding_at"], units = "days") > 0 & !is.na(vc_data2[i,"founded_at"]) & !is.na(vc_data2[i,"first_funding_at"])){
vc_data2[i,"first_funding_at"] = vc_data2[i,"founded_at"]
fund_date_issue[i] = TRUE
}
}
fund_date_issue2 <- data.frame(fund_date_issue)
vc_data3 <- cbind(vc_data2,fund_date_issue2)
head(vc_data3)
## name
## 1 #waywire
## 2 &TV Communications
## 3 'Rock' Your Paper
## 4 (In)Touch Network
## 5 -R- Ranch and Mine
## 6 .Club Domains
## category_list
## 1 |Entertainment|Politics|Social Media|News|
## 2 |Games|
## 3 |Publishing|Education|
## 4 |Electronics|Guides|Coffee|Restaurants|Music|iPhone|Apps|Mobile|iOS|E-Commerce|
## 5 |Tourism|Entertainment|Games|
## 6 |Software|
## market funding_total_usd status country_code state_code
## 1 News 17,50,000 acquired USA NY
## 2 Games 40,00,000 operating USA CA
## 3 Publishing 40,000 operating EST <NA>
## 4 Electronics 15,00,000 operating GBR <NA>
## 5 Tourism 60,000 operating USA TX
## 6 Software 70,00,000 <NA> USA FL
## region city funding_rounds founded_at founded_month
## 1 New York City New York 1 2012-06-01 2012-06
## 2 Los Angeles Los Angeles 2 <NA> <NA>
## 3 Tallinn Tallinn 1 2012-10-26 2012-10
## 4 London London 1 2011-04-01 2011-04
## 5 Dallas Fort Worth 2 2014-01-01 2014-01
## 6 Ft. Lauderdale Oakland Park 1 2011-10-10 2011-10
## founded_quarter founded_year first_funding_at last_funding_at seed venture
## 1 2012-Q2 2012 2012-06-30 2012-06-30 1750000 0e+00
## 2 <NA> NA 2010-06-04 2010-09-23 0 4e+06
## 3 2012-Q4 2012 2012-10-26 2012-08-09 40000 0e+00
## 4 2011-Q2 2011 2011-04-01 2011-04-01 1500000 0e+00
## 5 2014-Q1 2014 2014-08-17 2014-09-26 0 0e+00
## 6 2011-Q4 2011 2013-05-31 2013-05-31 0 7e+06
## equity_crowdfunding undisclosed convertible_note debt_financing angel grant
## 1 0 0 0 0 0 0
## 2 0 0 0 0 0 0
## 3 0 0 0 0 0 0
## 4 0 0 0 0 0 0
## 5 60000 0 0 0 0 0
## 6 0 0 0 0 0 0
## private_equity post_ipo_equity post_ipo_debt secondary_market
## 1 0 0 0 0
## 2 0 0 0 0
## 3 0 0 0 0
## 4 0 0 0 0
## 5 0 0 0 0
## 6 0 0 0 0
## product_crowdfunding round_A round_B round_C round_D round_E fund_date_issue
## 1 0 0 0 0 0 0 FALSE
## 2 0 0 0 0 0 0 FALSE
## 3 0 0 0 0 0 0 TRUE
## 4 0 0 0 0 0 0 FALSE
## 5 0 0 0 0 0 0 FALSE
## 6 0 0 7000000 0 0 0 FALSE
# Identify dates in the founded_at column and set their year values equal to the founded_year value
head(vc_data3 %>% arrange(desc(founded_at)))
## name category_list
## 1 Anacomp |Consulting|
## 2 Casey's General Stores |E-Commerce|
## 3 Gruppo Argenta |Recreation|Hospitality|
## 4 Healthcare Corporation of America |Biotechnology|
## 5 Matthew Walker Comprehensive Health Center |Nonprofits|
## 6 Netsmart Technologies |Enterprise Software|
## market funding_total_usd status country_code state_code
## 1 Consulting 75,00,000 acquired USA CA
## 2 E-Commerce 56,90,00,000 operating USA IA
## 3 Recreation 12,86,60,000 operating <NA> <NA>
## 4 Biotechnology 91,25,000 operating USA TN
## 5 Nonprofits 1,73,210 operating USA TN
## 6 Enterprise Software 45,00,000 operating USA NY
## region city funding_rounds founded_at founded_month
## 1 San Diego San Diego 1 2068-01-01 1968-01
## 2 Des Moines Ankeny 1 2068-01-01 1968-01
## 3 <NA> <NA> 1 2068-01-01 1968-01
## 4 Nashville Nashville 2 2068-01-01 1968-01
## 5 Nashville Nashville 1 2068-01-01 1968-01
## 6 NY - Other Great River 1 2068-01-01 1968-01
## founded_quarter founded_year first_funding_at last_funding_at seed venture
## 1 1968-Q1 1968 2068-01-01 2009-06-08 0 7500000
## 2 1968-Q1 1968 2068-01-01 2010-08-23 0 0
## 3 1968-Q1 1968 2068-01-01 2014-01-21 0 0
## 4 1968-Q1 1968 2068-01-01 2014-06-25 0 0
## 5 1968-Q1 1968 2068-01-01 2014-02-12 0 0
## 6 1968-Q1 1968 2068-01-01 2005-09-19 0 4500000
## equity_crowdfunding undisclosed convertible_note debt_financing angel grant
## 1 0 0 0 0.00e+00 0 0
## 2 0 0 0 5.69e+08 0 0
## 3 0 0 0 0.00e+00 0 0
## 4 0 0 0 3.20e+06 0 0
## 5 0 0 0 0.00e+00 0 173210
## 6 0 0 0 0.00e+00 0 0
## private_equity post_ipo_equity post_ipo_debt secondary_market
## 1 0 0 0 0
## 2 0 0 0 0
## 3 128660000 0 0 0
## 4 5925000 0 0 0
## 5 0 0 0 0
## 6 0 0 0 0
## product_crowdfunding round_A round_B round_C round_D round_E fund_date_issue
## 1 0 0 0 0 0 0 TRUE
## 2 0 0 0 0 0 0 TRUE
## 3 0 0 0 0 0 0 TRUE
## 4 0 0 0 0 0 0 TRUE
## 5 0 0 0 0 0 0 TRUE
## 6 0 0 0 0 0 0 TRUE
for(i in 1:nrow(vc_data3)){
if(!is.na(vc_data3[i,"founded_at"]) & vc_data3[i,"founded_at"] > 2014-12-13){
year(vc_data3[i,"founded_at"]) = vc_data3[i,"founded_year"]
year(vc_data3[i,"first_funding_at"]) = vc_data3[i,"founded_year"]
year(vc_data3[i,"last_funding_at"]) = vc_data3[i,"founded_year"]
}
}
head(vc_data3)
## name
## 1 #waywire
## 2 &TV Communications
## 3 'Rock' Your Paper
## 4 (In)Touch Network
## 5 -R- Ranch and Mine
## 6 .Club Domains
## category_list
## 1 |Entertainment|Politics|Social Media|News|
## 2 |Games|
## 3 |Publishing|Education|
## 4 |Electronics|Guides|Coffee|Restaurants|Music|iPhone|Apps|Mobile|iOS|E-Commerce|
## 5 |Tourism|Entertainment|Games|
## 6 |Software|
## market funding_total_usd status country_code state_code
## 1 News 17,50,000 acquired USA NY
## 2 Games 40,00,000 operating USA CA
## 3 Publishing 40,000 operating EST <NA>
## 4 Electronics 15,00,000 operating GBR <NA>
## 5 Tourism 60,000 operating USA TX
## 6 Software 70,00,000 <NA> USA FL
## region city funding_rounds founded_at founded_month
## 1 New York City New York 1 2012-06-01 2012-06
## 2 Los Angeles Los Angeles 2 <NA> <NA>
## 3 Tallinn Tallinn 1 2012-10-26 2012-10
## 4 London London 1 2011-04-01 2011-04
## 5 Dallas Fort Worth 2 2014-01-01 2014-01
## 6 Ft. Lauderdale Oakland Park 1 2011-10-10 2011-10
## founded_quarter founded_year first_funding_at last_funding_at seed venture
## 1 2012-Q2 2012 2012-06-30 2012-06-30 1750000 0e+00
## 2 <NA> NA 2010-06-04 2010-09-23 0 4e+06
## 3 2012-Q4 2012 2012-10-26 2012-08-09 40000 0e+00
## 4 2011-Q2 2011 2011-04-01 2011-04-01 1500000 0e+00
## 5 2014-Q1 2014 2014-08-17 2014-09-26 0 0e+00
## 6 2011-Q4 2011 2011-05-31 2011-05-31 0 7e+06
## equity_crowdfunding undisclosed convertible_note debt_financing angel grant
## 1 0 0 0 0 0 0
## 2 0 0 0 0 0 0
## 3 0 0 0 0 0 0
## 4 0 0 0 0 0 0
## 5 60000 0 0 0 0 0
## 6 0 0 0 0 0 0
## private_equity post_ipo_equity post_ipo_debt secondary_market
## 1 0 0 0 0
## 2 0 0 0 0
## 3 0 0 0 0
## 4 0 0 0 0
## 5 0 0 0 0
## 6 0 0 0 0
## product_crowdfunding round_A round_B round_C round_D round_E fund_date_issue
## 1 0 0 0 0 0 0 FALSE
## 2 0 0 0 0 0 0 FALSE
## 3 0 0 0 0 0 0 TRUE
## 4 0 0 0 0 0 0 FALSE
## 5 0 0 0 0 0 0 FALSE
## 6 0 0 7000000 0 0 0 FALSE
tail(vc_data3)
## name
## 49433 Zytoprotec
## 49434 Zzish
## 49435 ZZNode Science and Technology
## 49436 Zzzzapp Wireless ltd.
## 49437 [a]list games
## 49438 [x+1]
## category_list
## 49433 |Biotechnology|
## 49434 |Analytics|Gamification|Developer APIs|iOS|Android|Education|
## 49435 |Enterprise Software|
## 49436 |Web Development|Advertising|Wireless|Mobile|
## 49437 |Games|
## 49438 |Enterprise Software|
## market funding_total_usd status country_code state_code
## 49433 Biotechnology 26,86,600 operating AUT <NA>
## 49434 Education 3,20,000 operating GBR <NA>
## 49435 Enterprise Software 15,87,301 operating CHN <NA>
## 49436 Web Development 97,398 operating HRV <NA>
## 49437 Games 93,00,000 operating <NA> <NA>
## 49438 Enterprise Software 4,50,00,000 operating USA NY
## region city funding_rounds founded_at founded_month
## 49433 Vienna Gerasdorf Bei Wien 1 2007-01-01 2007-01
## 49434 London London 1 2013-01-28 2013-01
## 49435 Beijing Beijing 1 <NA> <NA>
## 49436 Split Split 5 2012-05-13 2012-05
## 49437 <NA> <NA> 1 <NA> <NA>
## 49438 New York City New York 4 1999-01-01 1999-01
## founded_quarter founded_year first_funding_at last_funding_at seed
## 49433 2007-Q1 2007 2007-01-29 2007-01-29 0
## 49434 2013-Q1 2013 2013-03-24 2013-03-24 320000
## 49435 <NA> NA 2012-04-01 2012-04-01 0
## 49436 2012-Q2 2012 2012-05-13 2012-09-10 71525
## 49437 <NA> NA 2011-11-21 2011-11-21 9300000
## 49438 1999-Q1 1999 1999-06-01 1999-04-04 0
## venture equity_crowdfunding undisclosed convertible_note debt_financing
## 49433 2686600 0 0 0 0.0e+00
## 49434 0 0 0 0 0.0e+00
## 49435 1587301 0 0 0 0.0e+00
## 49436 0 0 0 25873 0.0e+00
## 49437 0 0 0 0 0.0e+00
## 49438 28000000 0 0 0 1.7e+07
## angel grant private_equity post_ipo_equity post_ipo_debt secondary_market
## 49433 0 0 0 0 0 0
## 49434 0 0 0 0 0 0
## 49435 0 0 0 0 0 0
## 49436 0 0 0 0 0 0
## 49437 0 0 0 0 0 0
## 49438 0 0 0 0 0 0
## product_crowdfunding round_A round_B round_C round_D round_E
## 49433 0 2686600 0 0 0 0
## 49434 0 0 0 0 0 0
## 49435 0 1587301 0 0 0 0
## 49436 0 0 0 0 0 0
## 49437 0 0 0 0 0 0
## 49438 0 16000000 10000000 0 0 0
## fund_date_issue
## 49433 FALSE
## 49434 FALSE
## 49435 FALSE
## 49436 TRUE
## 49437 FALSE
## 49438 FALSE
Remove commas from total funding column and extract month and quarter from founded_month and founded_quarter columns
vc_data3$funding_total_usd<- as.numeric(gsub(",","",vc_data3$funding_total_usd))
## Warning: NAs introduced by coercion
vc_data4 <- vc_data3 %>% mutate(founded_month = as.integer(substr(founded_month,6,7)), founded_quarter = as.factor(substr(founded_quarter,6,7)))
head(vc_data4)
## name
## 1 #waywire
## 2 &TV Communications
## 3 'Rock' Your Paper
## 4 (In)Touch Network
## 5 -R- Ranch and Mine
## 6 .Club Domains
## category_list
## 1 |Entertainment|Politics|Social Media|News|
## 2 |Games|
## 3 |Publishing|Education|
## 4 |Electronics|Guides|Coffee|Restaurants|Music|iPhone|Apps|Mobile|iOS|E-Commerce|
## 5 |Tourism|Entertainment|Games|
## 6 |Software|
## market funding_total_usd status country_code state_code
## 1 News 1750000 acquired USA NY
## 2 Games 4000000 operating USA CA
## 3 Publishing 40000 operating EST <NA>
## 4 Electronics 1500000 operating GBR <NA>
## 5 Tourism 60000 operating USA TX
## 6 Software 7000000 <NA> USA FL
## region city funding_rounds founded_at founded_month
## 1 New York City New York 1 2012-06-01 6
## 2 Los Angeles Los Angeles 2 <NA> NA
## 3 Tallinn Tallinn 1 2012-10-26 10
## 4 London London 1 2011-04-01 4
## 5 Dallas Fort Worth 2 2014-01-01 1
## 6 Ft. Lauderdale Oakland Park 1 2011-10-10 10
## founded_quarter founded_year first_funding_at last_funding_at seed venture
## 1 Q2 2012 2012-06-30 2012-06-30 1750000 0e+00
## 2 <NA> NA 2010-06-04 2010-09-23 0 4e+06
## 3 Q4 2012 2012-10-26 2012-08-09 40000 0e+00
## 4 Q2 2011 2011-04-01 2011-04-01 1500000 0e+00
## 5 Q1 2014 2014-08-17 2014-09-26 0 0e+00
## 6 Q4 2011 2011-05-31 2011-05-31 0 7e+06
## equity_crowdfunding undisclosed convertible_note debt_financing angel grant
## 1 0 0 0 0 0 0
## 2 0 0 0 0 0 0
## 3 0 0 0 0 0 0
## 4 0 0 0 0 0 0
## 5 60000 0 0 0 0 0
## 6 0 0 0 0 0 0
## private_equity post_ipo_equity post_ipo_debt secondary_market
## 1 0 0 0 0
## 2 0 0 0 0
## 3 0 0 0 0
## 4 0 0 0 0
## 5 0 0 0 0
## 6 0 0 0 0
## product_crowdfunding round_A round_B round_C round_D round_E fund_date_issue
## 1 0 0 0 0 0 0 FALSE
## 2 0 0 0 0 0 0 FALSE
## 3 0 0 0 0 0 0 TRUE
## 4 0 0 0 0 0 0 FALSE
## 5 0 0 0 0 0 0 FALSE
## 6 0 0 7000000 0 0 0 FALSE
Change founded_month integer variable to a factor with levels equal to the names of months
month <- vector(mode = "character", length = nrow(vc_data4))
for(i in 1:nrow(vc_data4)){
if(is.na(vc_data4[i,"founded_month"])){
month[i] = NA
}else if(vc_data4[i,"founded_month"] == 1){
month[i] = "January"
} else if(vc_data4[i,"founded_month"] == 2){
month[i] = "February"
} else if(vc_data4[i,"founded_month"] == 3){
month[i] = "March"
}else if(vc_data4[i,"founded_month"] == 4){
month[i] = "April"
}else if(vc_data4[i,"founded_month"] == 5){
month[i] = "May"
}else if(vc_data4[i,"founded_month"] == 6){
month[i] = "June"
}else if(vc_data4[i,"founded_month"] == 7){
month[i] = "July"
}else if(vc_data4[i,"founded_month"] == 8){
month[i] = "August"
}else if(vc_data4[i,"founded_month"] == 9){
month[i] = "September"
}else if(vc_data4[i,"founded_month"] == 10){
month[i] = "October"
}else if(vc_data4[i,"founded_month"] == 11){
month[i] = "November"
}else if(vc_data4[i,"founded_month"] == 12){
month[i] = "December"
}
}
for(i in 1:nrow(vc_data4)){
vc_data4[i,12] = month[i]
}
head(vc_data4)
## name
## 1 #waywire
## 2 &TV Communications
## 3 'Rock' Your Paper
## 4 (In)Touch Network
## 5 -R- Ranch and Mine
## 6 .Club Domains
## category_list
## 1 |Entertainment|Politics|Social Media|News|
## 2 |Games|
## 3 |Publishing|Education|
## 4 |Electronics|Guides|Coffee|Restaurants|Music|iPhone|Apps|Mobile|iOS|E-Commerce|
## 5 |Tourism|Entertainment|Games|
## 6 |Software|
## market funding_total_usd status country_code state_code
## 1 News 1750000 acquired USA NY
## 2 Games 4000000 operating USA CA
## 3 Publishing 40000 operating EST <NA>
## 4 Electronics 1500000 operating GBR <NA>
## 5 Tourism 60000 operating USA TX
## 6 Software 7000000 <NA> USA FL
## region city funding_rounds founded_at founded_month
## 1 New York City New York 1 2012-06-01 June
## 2 Los Angeles Los Angeles 2 <NA> <NA>
## 3 Tallinn Tallinn 1 2012-10-26 October
## 4 London London 1 2011-04-01 April
## 5 Dallas Fort Worth 2 2014-01-01 January
## 6 Ft. Lauderdale Oakland Park 1 2011-10-10 October
## founded_quarter founded_year first_funding_at last_funding_at seed venture
## 1 Q2 2012 2012-06-30 2012-06-30 1750000 0e+00
## 2 <NA> NA 2010-06-04 2010-09-23 0 4e+06
## 3 Q4 2012 2012-10-26 2012-08-09 40000 0e+00
## 4 Q2 2011 2011-04-01 2011-04-01 1500000 0e+00
## 5 Q1 2014 2014-08-17 2014-09-26 0 0e+00
## 6 Q4 2011 2011-05-31 2011-05-31 0 7e+06
## equity_crowdfunding undisclosed convertible_note debt_financing angel grant
## 1 0 0 0 0 0 0
## 2 0 0 0 0 0 0
## 3 0 0 0 0 0 0
## 4 0 0 0 0 0 0
## 5 60000 0 0 0 0 0
## 6 0 0 0 0 0 0
## private_equity post_ipo_equity post_ipo_debt secondary_market
## 1 0 0 0 0
## 2 0 0 0 0
## 3 0 0 0 0
## 4 0 0 0 0
## 5 0 0 0 0
## 6 0 0 0 0
## product_crowdfunding round_A round_B round_C round_D round_E fund_date_issue
## 1 0 0 0 0 0 0 FALSE
## 2 0 0 0 0 0 0 FALSE
## 3 0 0 0 0 0 0 TRUE
## 4 0 0 0 0 0 0 FALSE
## 5 0 0 0 0 0 0 FALSE
## 6 0 0 7000000 0 0 0 FALSE
vc_data5<- vc_data4 %>% mutate(founded_month = as.factor(founded_month))
head(vc_data5)
## name
## 1 #waywire
## 2 &TV Communications
## 3 'Rock' Your Paper
## 4 (In)Touch Network
## 5 -R- Ranch and Mine
## 6 .Club Domains
## category_list
## 1 |Entertainment|Politics|Social Media|News|
## 2 |Games|
## 3 |Publishing|Education|
## 4 |Electronics|Guides|Coffee|Restaurants|Music|iPhone|Apps|Mobile|iOS|E-Commerce|
## 5 |Tourism|Entertainment|Games|
## 6 |Software|
## market funding_total_usd status country_code state_code
## 1 News 1750000 acquired USA NY
## 2 Games 4000000 operating USA CA
## 3 Publishing 40000 operating EST <NA>
## 4 Electronics 1500000 operating GBR <NA>
## 5 Tourism 60000 operating USA TX
## 6 Software 7000000 <NA> USA FL
## region city funding_rounds founded_at founded_month
## 1 New York City New York 1 2012-06-01 June
## 2 Los Angeles Los Angeles 2 <NA> <NA>
## 3 Tallinn Tallinn 1 2012-10-26 October
## 4 London London 1 2011-04-01 April
## 5 Dallas Fort Worth 2 2014-01-01 January
## 6 Ft. Lauderdale Oakland Park 1 2011-10-10 October
## founded_quarter founded_year first_funding_at last_funding_at seed venture
## 1 Q2 2012 2012-06-30 2012-06-30 1750000 0e+00
## 2 <NA> NA 2010-06-04 2010-09-23 0 4e+06
## 3 Q4 2012 2012-10-26 2012-08-09 40000 0e+00
## 4 Q2 2011 2011-04-01 2011-04-01 1500000 0e+00
## 5 Q1 2014 2014-08-17 2014-09-26 0 0e+00
## 6 Q4 2011 2011-05-31 2011-05-31 0 7e+06
## equity_crowdfunding undisclosed convertible_note debt_financing angel grant
## 1 0 0 0 0 0 0
## 2 0 0 0 0 0 0
## 3 0 0 0 0 0 0
## 4 0 0 0 0 0 0
## 5 60000 0 0 0 0 0
## 6 0 0 0 0 0 0
## private_equity post_ipo_equity post_ipo_debt secondary_market
## 1 0 0 0 0
## 2 0 0 0 0
## 3 0 0 0 0
## 4 0 0 0 0
## 5 0 0 0 0
## 6 0 0 0 0
## product_crowdfunding round_A round_B round_C round_D round_E fund_date_issue
## 1 0 0 0 0 0 0 FALSE
## 2 0 0 0 0 0 0 FALSE
## 3 0 0 0 0 0 0 TRUE
## 4 0 0 0 0 0 0 FALSE
## 5 0 0 0 0 0 0 FALSE
## 6 0 0 7000000 0 0 0 FALSE
Group related market variable categories together to reduce the number of categories
markets2 <- vector(mode = "character", length = nrow(vc_data5))
markets3 <- factor(markets2,levels = c("Internet", "Big Data Analytics/Security", "Apps/Social Media", "Energy","Health/Medicine/Biotech", "Finance","Other"))
vc_data_clean <- cbind(vc_data5,markets3)
head(vc_data_clean)
## name
## 1 #waywire
## 2 &TV Communications
## 3 'Rock' Your Paper
## 4 (In)Touch Network
## 5 -R- Ranch and Mine
## 6 .Club Domains
## category_list
## 1 |Entertainment|Politics|Social Media|News|
## 2 |Games|
## 3 |Publishing|Education|
## 4 |Electronics|Guides|Coffee|Restaurants|Music|iPhone|Apps|Mobile|iOS|E-Commerce|
## 5 |Tourism|Entertainment|Games|
## 6 |Software|
## market funding_total_usd status country_code state_code
## 1 News 1750000 acquired USA NY
## 2 Games 4000000 operating USA CA
## 3 Publishing 40000 operating EST <NA>
## 4 Electronics 1500000 operating GBR <NA>
## 5 Tourism 60000 operating USA TX
## 6 Software 7000000 <NA> USA FL
## region city funding_rounds founded_at founded_month
## 1 New York City New York 1 2012-06-01 June
## 2 Los Angeles Los Angeles 2 <NA> <NA>
## 3 Tallinn Tallinn 1 2012-10-26 October
## 4 London London 1 2011-04-01 April
## 5 Dallas Fort Worth 2 2014-01-01 January
## 6 Ft. Lauderdale Oakland Park 1 2011-10-10 October
## founded_quarter founded_year first_funding_at last_funding_at seed venture
## 1 Q2 2012 2012-06-30 2012-06-30 1750000 0e+00
## 2 <NA> NA 2010-06-04 2010-09-23 0 4e+06
## 3 Q4 2012 2012-10-26 2012-08-09 40000 0e+00
## 4 Q2 2011 2011-04-01 2011-04-01 1500000 0e+00
## 5 Q1 2014 2014-08-17 2014-09-26 0 0e+00
## 6 Q4 2011 2011-05-31 2011-05-31 0 7e+06
## equity_crowdfunding undisclosed convertible_note debt_financing angel grant
## 1 0 0 0 0 0 0
## 2 0 0 0 0 0 0
## 3 0 0 0 0 0 0
## 4 0 0 0 0 0 0
## 5 60000 0 0 0 0 0
## 6 0 0 0 0 0 0
## private_equity post_ipo_equity post_ipo_debt secondary_market
## 1 0 0 0 0
## 2 0 0 0 0
## 3 0 0 0 0
## 4 0 0 0 0
## 5 0 0 0 0
## 6 0 0 0 0
## product_crowdfunding round_A round_B round_C round_D round_E fund_date_issue
## 1 0 0 0 0 0 0 FALSE
## 2 0 0 0 0 0 0 FALSE
## 3 0 0 0 0 0 0 TRUE
## 4 0 0 0 0 0 0 FALSE
## 5 0 0 0 0 0 0 FALSE
## 6 0 0 7000000 0 0 0 FALSE
## markets3
## 1 <NA>
## 2 <NA>
## 3 <NA>
## 4 <NA>
## 5 <NA>
## 6 <NA>
for(i in 1:nrow(vc_data5)){
if(is.na(vc_data5[i,"market"])){
vc_data_clean[i,"markets3"] = NA
} else if(str_detect(vc_data5[i,"market"],"Web") | str_detect(vc_data5[i,"market"],"Internet") | str_detect(vc_data5[i,"market"],"Cloud") | str_detect(vc_data5[i,"market"],"IT") | str_detect(vc_data5[i,"market"],"E-Commerce") | str_detect(vc_data5[i,"market"],"Online") | str_detect(vc_data5[i,"market"],"Information") | str_detect(vc_data5[i,"market"],"Enterprise")){
vc_data_clean[i,"markets3"] = "Internet"
} else if(str_detect(vc_data5[i,"market"],"Augmented Reality") | str_detect(vc_data5[i,"market"],"Data") | str_detect(vc_data5[i,"market"],"Artificial Intelligence") | str_detect(vc_data5[i,"market"],"Cyber") | str_detect(vc_data5[i,"market"],"Security") | str_detect(vc_data5[i,"market"],"Machine Learning") | str_detect(vc_data5[i,"market"],"Analytics") | str_detect(vc_data5[i,"market"],"Algorithms")){
vc_data_clean[i,"markets3"] = "Big Data Analytics/Security"
} else if(str_detect(vc_data5[i,"market"],"Mobile") | str_detect(vc_data5[i,"market"],"Social")){
vc_data_clean[i,"markets3"] = "Apps/Social Media"
} else if(str_detect(vc_data5[i,"market"],"Energy") | str_detect(vc_data5[i,"market"],"Biofeuls") | str_detect(vc_data5[i,"market"],"Biomass") | str_detect(vc_data5[i,"market"],"Clean") | str_detect(vc_data5[i,"market"],"Green") | str_detect(vc_data5[i,"market"],"Natural") | str_detect(vc_data5[i,"market"],"Oil") | str_detect(vc_data5[i,"market"],"Fuels") | str_detect(vc_data5[i,"market"],"Renewable")){
vc_data_clean[i,"markets3"] = "Energy"
} else if(str_detect(vc_data5[i,"market"],"Health") | str_detect(vc_data5[i,"market"],"Biometrics") | str_detect(vc_data5[i,"market"],"Biotechnology") | str_detect(vc_data5[i,"market"],"Medical")){
vc_data_clean[i,"markets3"] = "Health/Medicine/Biotech"
} else if(str_detect(vc_data5[i,"market"],"Finance") | str_detect(vc_data5[i,"market"],"Financial") | str_detect(vc_data5[i,"market"],"Banking") | str_detect(vc_data5[i,"market"],"Bitcoin") | str_detect(vc_data5[i,"market"],"P2P")){
vc_data_clean[i,"markets3"] = "Finance"
} else {
vc_data_clean[i,"markets3"] = "Other"
}
}
head(vc_data_clean)
## name
## 1 #waywire
## 2 &TV Communications
## 3 'Rock' Your Paper
## 4 (In)Touch Network
## 5 -R- Ranch and Mine
## 6 .Club Domains
## category_list
## 1 |Entertainment|Politics|Social Media|News|
## 2 |Games|
## 3 |Publishing|Education|
## 4 |Electronics|Guides|Coffee|Restaurants|Music|iPhone|Apps|Mobile|iOS|E-Commerce|
## 5 |Tourism|Entertainment|Games|
## 6 |Software|
## market funding_total_usd status country_code state_code
## 1 News 1750000 acquired USA NY
## 2 Games 4000000 operating USA CA
## 3 Publishing 40000 operating EST <NA>
## 4 Electronics 1500000 operating GBR <NA>
## 5 Tourism 60000 operating USA TX
## 6 Software 7000000 <NA> USA FL
## region city funding_rounds founded_at founded_month
## 1 New York City New York 1 2012-06-01 June
## 2 Los Angeles Los Angeles 2 <NA> <NA>
## 3 Tallinn Tallinn 1 2012-10-26 October
## 4 London London 1 2011-04-01 April
## 5 Dallas Fort Worth 2 2014-01-01 January
## 6 Ft. Lauderdale Oakland Park 1 2011-10-10 October
## founded_quarter founded_year first_funding_at last_funding_at seed venture
## 1 Q2 2012 2012-06-30 2012-06-30 1750000 0e+00
## 2 <NA> NA 2010-06-04 2010-09-23 0 4e+06
## 3 Q4 2012 2012-10-26 2012-08-09 40000 0e+00
## 4 Q2 2011 2011-04-01 2011-04-01 1500000 0e+00
## 5 Q1 2014 2014-08-17 2014-09-26 0 0e+00
## 6 Q4 2011 2011-05-31 2011-05-31 0 7e+06
## equity_crowdfunding undisclosed convertible_note debt_financing angel grant
## 1 0 0 0 0 0 0
## 2 0 0 0 0 0 0
## 3 0 0 0 0 0 0
## 4 0 0 0 0 0 0
## 5 60000 0 0 0 0 0
## 6 0 0 0 0 0 0
## private_equity post_ipo_equity post_ipo_debt secondary_market
## 1 0 0 0 0
## 2 0 0 0 0
## 3 0 0 0 0
## 4 0 0 0 0
## 5 0 0 0 0
## 6 0 0 0 0
## product_crowdfunding round_A round_B round_C round_D round_E fund_date_issue
## 1 0 0 0 0 0 0 FALSE
## 2 0 0 0 0 0 0 FALSE
## 3 0 0 0 0 0 0 TRUE
## 4 0 0 0 0 0 0 FALSE
## 5 0 0 0 0 0 0 FALSE
## 6 0 0 7000000 0 0 0 FALSE
## markets3
## 1 Other
## 2 Other
## 3 Other
## 4 Other
## 5 Other
## 6 Other
Create categorical variable for numerical venture variable
vc_data_clean2 <- vc_data_clean %>% filter(fund_date_issue != TRUE, !is.na(venture), !is.na(first_funding_at), venture > 0)
vc_fund_cats1 <- vc_data_clean2$venture
range(vc_data_clean2$venture)
## [1] 2.200e+01 2.351e+09
vc_data_clean2$venture_cats <- cut(vc_fund_cats1, 20, labels = c("0 to 117 million", "117 to 235 million","235 to 352 million","352 to 470 million","470 to 588 million","588 to 705 million","705 to 823 million","823 to 940 million","940 million to 1 billion","1 to 1.175 billion","1.17 to 1.3 billion","1.3 to 1.4 billion","1.4 to 1.53 billion","1.53 to 1.64 billion","1.64 to 1.76 billion", "1.76 to 1.88 billion", "1.88 to 1.99 billion","1.99 to 2.11 billion","2.11 to 2.23 billion","2.23 to 2.35 billion"))
head(vc_data_clean2)
## name category_list market
## 1 &TV Communications |Games| Games
## 2 .Club Domains |Software| Software
## 3 0-6.com |Curated Web| Curated Web
## 4 10 Minutes With |Education| Education
## 5 1000museums.com |Curated Web| Curated Web
## 6 1001 Menus |Local Businesses|Restaurants| Restaurants
## funding_total_usd status country_code state_code region
## 1 4000000 operating USA CA Los Angeles
## 2 7000000 <NA> USA FL Ft. Lauderdale
## 3 2000000 operating <NA> <NA> <NA>
## 4 4400000 operating GBR <NA> London
## 5 4962651 operating USA WA Seattle
## 6 4059079 operating FRA <NA> Paris
## city funding_rounds founded_at founded_month founded_quarter
## 1 Los Angeles 2 <NA> <NA> <NA>
## 2 Oakland Park 1 2011-10-10 October Q4
## 3 <NA> 1 2007-01-01 January Q1
## 4 London 2 2013-01-01 January Q1
## 5 Bellevue 6 2008-01-01 January Q1
## 6 Paris 4 2010-11-20 November Q4
## founded_year first_funding_at last_funding_at seed venture
## 1 NA 2010-06-04 2010-09-23 0 4000000
## 2 2011 2011-05-31 2011-05-31 0 7000000
## 3 2007 2007-03-19 2007-03-19 0 2000000
## 4 2013 2013-01-01 2013-10-09 400000 4000000
## 5 2008 2008-10-14 2008-09-19 0 3814772
## 6 2010 2010-12-15 2010-11-13 522169 3536910
## equity_crowdfunding undisclosed convertible_note debt_financing angel grant
## 1 0 0 0 0 0 0
## 2 0 0 0 0 0 0
## 3 0 0 0 0 0 0
## 4 0 0 0 0 0 0
## 5 0 0 0 1147879 0 0
## 6 0 0 0 0 0 0
## private_equity post_ipo_equity post_ipo_debt secondary_market
## 1 0 0 0 0
## 2 0 0 0 0
## 3 0 0 0 0
## 4 0 0 0 0
## 5 0 0 0 0
## 6 0 0 0 0
## product_crowdfunding round_A round_B round_C round_D round_E fund_date_issue
## 1 0 0 0 0 0 0 FALSE
## 2 0 0 7000000 0 0 0 FALSE
## 3 0 2000000 0 0 0 0 FALSE
## 4 0 4000000 0 0 0 0 FALSE
## 5 0 0 0 0 0 0 FALSE
## 6 0 3536910 0 0 0 0 FALSE
## markets3 venture_cats
## 1 Other 0 to 117 million
## 2 Other 0 to 117 million
## 3 Internet 0 to 117 million
## 4 Other 0 to 117 million
## 5 Internet 0 to 117 million
## 6 Other 0 to 117 million
Take a sample of 500 observations of venture funding and adjust the nominal venture funding for inflation such that funding from any time period is expressed in 2014 prices
real_venture<-vector(mode = "numeric", length = nrow(vc_data_clean2))
vc_data_clean2 <- cbind(vc_data_clean2,real_venture)
head(vc_data_clean2)
## name category_list market
## 1 &TV Communications |Games| Games
## 2 .Club Domains |Software| Software
## 3 0-6.com |Curated Web| Curated Web
## 4 10 Minutes With |Education| Education
## 5 1000museums.com |Curated Web| Curated Web
## 6 1001 Menus |Local Businesses|Restaurants| Restaurants
## funding_total_usd status country_code state_code region
## 1 4000000 operating USA CA Los Angeles
## 2 7000000 <NA> USA FL Ft. Lauderdale
## 3 2000000 operating <NA> <NA> <NA>
## 4 4400000 operating GBR <NA> London
## 5 4962651 operating USA WA Seattle
## 6 4059079 operating FRA <NA> Paris
## city funding_rounds founded_at founded_month founded_quarter
## 1 Los Angeles 2 <NA> <NA> <NA>
## 2 Oakland Park 1 2011-10-10 October Q4
## 3 <NA> 1 2007-01-01 January Q1
## 4 London 2 2013-01-01 January Q1
## 5 Bellevue 6 2008-01-01 January Q1
## 6 Paris 4 2010-11-20 November Q4
## founded_year first_funding_at last_funding_at seed venture
## 1 NA 2010-06-04 2010-09-23 0 4000000
## 2 2011 2011-05-31 2011-05-31 0 7000000
## 3 2007 2007-03-19 2007-03-19 0 2000000
## 4 2013 2013-01-01 2013-10-09 400000 4000000
## 5 2008 2008-10-14 2008-09-19 0 3814772
## 6 2010 2010-12-15 2010-11-13 522169 3536910
## equity_crowdfunding undisclosed convertible_note debt_financing angel grant
## 1 0 0 0 0 0 0
## 2 0 0 0 0 0 0
## 3 0 0 0 0 0 0
## 4 0 0 0 0 0 0
## 5 0 0 0 1147879 0 0
## 6 0 0 0 0 0 0
## private_equity post_ipo_equity post_ipo_debt secondary_market
## 1 0 0 0 0
## 2 0 0 0 0
## 3 0 0 0 0
## 4 0 0 0 0
## 5 0 0 0 0
## 6 0 0 0 0
## product_crowdfunding round_A round_B round_C round_D round_E fund_date_issue
## 1 0 0 0 0 0 0 FALSE
## 2 0 0 7000000 0 0 0 FALSE
## 3 0 2000000 0 0 0 0 FALSE
## 4 0 4000000 0 0 0 0 FALSE
## 5 0 0 0 0 0 0 FALSE
## 6 0 3536910 0 0 0 0 FALSE
## markets3 venture_cats real_venture
## 1 Other 0 to 117 million 0
## 2 Other 0 to 117 million 0
## 3 Internet 0 to 117 million 0
## 4 Other 0 to 117 million 0
## 5 Internet 0 to 117 million 0
## 6 Other 0 to 117 million 0
set.seed(123)
vc_data_clean3 <- sample_n(vc_data_clean2,500)
for(i in 1:nrow(data)){
for(j in 1:nrow(vc_data_clean3)){
if(!is.na(vc_data_clean3[j,"first_funding_at"]) & !is.na(vc_data_clean3[j,"venture"]) & difftime(vc_data_clean3[j,"first_funding_at"], data[i,"Date"], units = "days") >= 0 & difftime(vc_data_clean3[j,"first_funding_at"], data[i,"Date"], units = "days") < 31){
vc_data_clean3[j,"real_venture"] = vc_data_clean3[j,"venture"]*(data[1213,"Index"]/data[i,"Index"])
}
}
}
head(vc_data_clean3)
## name category_list
## 1 Sunbeam |Biotechnology|
## 2 SuperOx Wastewater Co |Manufacturing|
## 3 Brain Tunnelgenix Technologies |Biotechnology|
## 4 Audioscribe <NA>
## 5 Canopy Labs |Lead Generation|Analytics|
## 6 Manta |Professional Networking|Curated Web|
## market funding_total_usd status country_code state_code
## 1 Biotechnology 9329636 operating USA FL
## 2 Manufacturing 1700000 operating USA TX
## 3 Biotechnology 2563168 operating USA CT
## 4 <NA> 1499800 operating <NA> <NA>
## 5 Lead Generation 2064000 <NA> USA CA
## 6 Curated Web 47215715 operating USA OH
## region city funding_rounds founded_at founded_month
## 1 Ft. Lauderdale Fort Lauderdale 1 2007-01-01 January
## 2 Houston Houston 1 2010-01-01 January
## 3 Hartford Bridgeport 3 2006-01-01 January
## 4 <NA> <NA> 2 <NA> <NA>
## 5 SF Bay Area San Francisco 2 2012-01-01 January
## 6 Columbus, Ohio Columbus 2 2005-09-01 September
## founded_quarter founded_year first_funding_at last_funding_at seed
## 1 Q1 2007 2007-12-11 2007-12-11 0
## 2 Q1 2010 2010-05-24 2010-05-24 0
## 3 Q1 2006 2006-04-16 2006-09-17 100000
## 4 <NA> NA 2012-07-01 2012-10-01 0
## 5 Q1 2012 2012-12-13 2012-03-21 1500000
## 6 Q3 2005 2005-01-04 2005-04-02 0
## venture equity_crowdfunding undisclosed convertible_note debt_financing
## 1 9329636 0 0 0 0
## 2 1700000 0 0 0 0
## 3 2213168 0 0 0 0
## 4 1499800 0 0 0 0
## 5 564000 0 0 0 0
## 6 47215715 0 0 0 0
## angel grant private_equity post_ipo_equity post_ipo_debt secondary_market
## 1 0 0 0 0 0 0
## 2 0 0 0 0 0 0
## 3 0 0 0 0 0 0
## 4 0 0 0 0 0 0
## 5 0 0 0 0 0 0
## 6 0 0 0 0 0 0
## product_crowdfunding round_A round_B round_C round_D round_E fund_date_issue
## 1 0 0 0 0 0 0 FALSE
## 2 0 0 0 0 0 0 FALSE
## 3 250000 0 0 0 0 0 FALSE
## 4 0 0 0 0 0 0 FALSE
## 5 0 0 0 0 0 0 FALSE
## 6 0 0 0 0 0 0 FALSE
## markets3 venture_cats real_venture
## 1 Health/Medicine/Biotech 0 to 117 million 10390367.1
## 2 Other 0 to 117 million 1822627.4
## 3 Health/Medicine/Biotech 0 to 117 million 2569208.0
## 4 <NA> 0 to 117 million 1528805.4
## 5 Other 0 to 117 million 574599.5
## 6 Internet 0 to 117 million 57915632.9
vc_fund_cats1 <- vc_data_clean3$real_venture
range(vc_data_clean3$real_venture)
## [1] 0 319412108
vc_data_clean3$venture_cats <- cut(vc_fund_cats1, 10, labels = c("0 to 32 million", "32 to 64 million","64 to 96 million","96 to 128 million","128 to 160 million","160 to 192 million","192 to 224 million","224 to 256 million","256 to 288 million","288 to 320 million"))
head(vc_data_clean3)
## name category_list
## 1 Sunbeam |Biotechnology|
## 2 SuperOx Wastewater Co |Manufacturing|
## 3 Brain Tunnelgenix Technologies |Biotechnology|
## 4 Audioscribe <NA>
## 5 Canopy Labs |Lead Generation|Analytics|
## 6 Manta |Professional Networking|Curated Web|
## market funding_total_usd status country_code state_code
## 1 Biotechnology 9329636 operating USA FL
## 2 Manufacturing 1700000 operating USA TX
## 3 Biotechnology 2563168 operating USA CT
## 4 <NA> 1499800 operating <NA> <NA>
## 5 Lead Generation 2064000 <NA> USA CA
## 6 Curated Web 47215715 operating USA OH
## region city funding_rounds founded_at founded_month
## 1 Ft. Lauderdale Fort Lauderdale 1 2007-01-01 January
## 2 Houston Houston 1 2010-01-01 January
## 3 Hartford Bridgeport 3 2006-01-01 January
## 4 <NA> <NA> 2 <NA> <NA>
## 5 SF Bay Area San Francisco 2 2012-01-01 January
## 6 Columbus, Ohio Columbus 2 2005-09-01 September
## founded_quarter founded_year first_funding_at last_funding_at seed
## 1 Q1 2007 2007-12-11 2007-12-11 0
## 2 Q1 2010 2010-05-24 2010-05-24 0
## 3 Q1 2006 2006-04-16 2006-09-17 100000
## 4 <NA> NA 2012-07-01 2012-10-01 0
## 5 Q1 2012 2012-12-13 2012-03-21 1500000
## 6 Q3 2005 2005-01-04 2005-04-02 0
## venture equity_crowdfunding undisclosed convertible_note debt_financing
## 1 9329636 0 0 0 0
## 2 1700000 0 0 0 0
## 3 2213168 0 0 0 0
## 4 1499800 0 0 0 0
## 5 564000 0 0 0 0
## 6 47215715 0 0 0 0
## angel grant private_equity post_ipo_equity post_ipo_debt secondary_market
## 1 0 0 0 0 0 0
## 2 0 0 0 0 0 0
## 3 0 0 0 0 0 0
## 4 0 0 0 0 0 0
## 5 0 0 0 0 0 0
## 6 0 0 0 0 0 0
## product_crowdfunding round_A round_B round_C round_D round_E fund_date_issue
## 1 0 0 0 0 0 0 FALSE
## 2 0 0 0 0 0 0 FALSE
## 3 250000 0 0 0 0 0 FALSE
## 4 0 0 0 0 0 0 FALSE
## 5 0 0 0 0 0 0 FALSE
## 6 0 0 0 0 0 0 FALSE
## markets3 venture_cats real_venture
## 1 Health/Medicine/Biotech 0 to 32 million 10390367.1
## 2 Other 0 to 32 million 1822627.4
## 3 Health/Medicine/Biotech 0 to 32 million 2569208.0
## 4 <NA> 0 to 32 million 1528805.4
## 5 Other 0 to 32 million 574599.5
## 6 Internet 32 to 64 million 57915632.9
Data Analysis
Proportion of Companies by Funding Rounds, Status, and Funding Amount
# Is the proportion of companies by status equal?
status_prop <- vc_data_clean %>% group_by(status) %>% summarise(num_comps = length(status)) %>% mutate(prop_comps = num_comps/sum(num_comps))
## Warning: Factor `status` contains implicit NA, consider using
## `forcats::fct_explicit_na`
status_prop
## # A tibble: 4 x 3
## status num_comps prop_comps
## <fct> <int> <dbl>
## 1 acquired 3692 0.0747
## 2 closed 2603 0.0527
## 3 operating 41829 0.846
## 4 <NA> 1314 0.0266
# Is the proportion of companies by funding amount equal?
fund_prop <- vc_data_clean2 %>% group_by(venture_cats) %>% summarise(num_comps = length(status)) %>% mutate(prop_comps = num_comps/sum(num_comps))
fund_prop
## # A tibble: 10 x 3
## venture_cats num_comps prop_comps
## <fct> <int> <dbl>
## 1 0 to 117 million 22154 0.986
## 2 117 to 235 million 245 0.0109
## 3 235 to 352 million 38 0.00169
## 4 352 to 470 million 11 0.000490
## 5 470 to 588 million 5 0.000223
## 6 588 to 705 million 6 0.000267
## 7 705 to 823 million 5 0.000223
## 8 1.17 to 1.3 billion 1 0.0000445
## 9 1.4 to 1.53 billion 1 0.0000445
## 10 2.23 to 2.35 billion 1 0.0000445
# Is the proportion of companies by funding round equal?
fund_rnd_prop <- vc_data_clean %>% group_by(funding_rounds) %>% summarise(num_comps = length(funding_rounds)) %>% mutate(prop_comps = num_comps/sum(num_comps))
fund_rnd_prop
## # A tibble: 17 x 3
## funding_rounds num_comps prop_comps
## <int> <int> <dbl>
## 1 1 32039 0.648
## 2 2 9219 0.186
## 3 3 4026 0.0814
## 4 4 1997 0.0404
## 5 5 1001 0.0202
## 6 6 560 0.0113
## 7 7 252 0.00510
## 8 8 152 0.00307
## 9 9 84 0.00170
## 10 10 43 0.000870
## 11 11 35 0.000708
## 12 12 12 0.000243
## 13 13 8 0.000162
## 14 14 4 0.0000809
## 15 15 4 0.0000809
## 16 16 1 0.0000202
## 17 18 1 0.0000202
Mean Venture Funding and Mean Funding Rounds by Month Founded & Mean Funding Rounds by Funding Amount
# Is mean venture capital funding the same across every month?
venture_fund_month <-vc_data_clean %>% group_by(founded_month) %>% filter(!is.na(venture)) %>% summarise(mean_funding = mean(venture), sum_funding = sum(venture)) %>% mutate(prop_funding = sum_funding/sum(sum_funding)) %>% arrange(desc(mean_funding))
## Warning: Factor `founded_month` contains implicit NA, consider using
## `forcats::fct_explicit_na`
venture_fund_month
## # A tibble: 13 x 4
## founded_month mean_funding sum_funding prop_funding
## <fct> <dbl> <dbl> <dbl>
## 1 January 10009150. 224665377728 0.606
## 2 September 6794291. 10232202539 0.0276
## 3 March 6358114. 10389158221 0.0280
## 4 November 6295164. 8190008518 0.0221
## 5 October 6013991. 8828538255 0.0238
## 6 February 5659564. 8302580255 0.0224
## 7 July 5603945. 7867939147 0.0212
## 8 August 5374815. 7535490177 0.0203
## 9 June 5219426. 8695564226 0.0234
## 10 <NA> 5185733. 56814885564 0.153
## 11 May 5048103. 7784174106 0.0210
## 12 December 4770146. 5237620651 0.0141
## 13 April 4065502. 6293397023 0.0170
ggplot(venture_fund_month) + geom_col(mapping = aes(x = founded_month, y = mean_funding)) + coord_flip()

# Is the mean number of funding rounds the same across every month?
fund_rnd_month <- vc_data_clean %>% group_by(founded_month) %>% filter(!is.na(funding_rounds)) %>% summarise(mean_rnds = mean(funding_rounds), sum_rnds = sum(funding_rounds)) %>% mutate(prop_rnds = sum_rnds/sum(sum_rnds))
## Warning: Factor `founded_month` contains implicit NA, consider using
## `forcats::fct_explicit_na`
fund_rnd_month
## # A tibble: 13 x 4
## founded_month mean_rnds sum_rnds prop_rnds
## <fct> <dbl> <int> <dbl>
## 1 April 1.64 2542 0.0303
## 2 August 1.71 2396 0.0286
## 3 December 1.72 1893 0.0226
## 4 February 1.67 2450 0.0292
## 5 January 1.86 41837 0.499
## 6 July 1.70 2381 0.0284
## 7 June 1.71 2856 0.0341
## 8 March 1.71 2791 0.0333
## 9 May 1.70 2617 0.0312
## 10 November 1.75 2279 0.0272
## 11 October 1.69 2474 0.0295
## 12 September 1.66 2507 0.0299
## 13 <NA> 1.35 14834 0.177
ggplot(fund_rnd_month) + geom_col(mapping = aes(x = founded_month, y = mean_rnds))

# Is the mean number of funding rounds the same across all funding amounts?
fund_rnd_funding <- vc_data_clean2 %>% group_by(venture_cats) %>% filter(!is.na(funding_rounds)) %>% summarise(mean_rnds = mean(funding_rounds), sum_rnds = sum(funding_rounds)) %>% mutate(prop_rnds = sum_rnds/sum(sum_rnds))
fund_rnd_funding
## # A tibble: 10 x 4
## venture_cats mean_rnds sum_rnds prop_rnds
## <fct> <dbl> <int> <dbl>
## 1 0 to 117 million 2.10 46446 0.966
## 2 117 to 235 million 5.16 1265 0.0263
## 3 235 to 352 million 5.05 192 0.00399
## 4 352 to 470 million 4.45 49 0.00102
## 5 470 to 588 million 8 40 0.000832
## 6 588 to 705 million 5.67 34 0.000707
## 7 705 to 823 million 7.4 37 0.000769
## 8 1.17 to 1.3 billion 8 8 0.000166
## 9 1.4 to 1.53 billion 6 6 0.000125
## 10 2.23 to 2.35 billion 11 11 0.000229
ggplot(fund_rnd_funding) + geom_col(mapping = aes(x = venture_cats, y = mean_rnds))

Mean Venture Funding and Funding Rounds by Status from 1913 to 1990 and from 1990 to 2014
# Does mean real venture funding by status vary from the 1980's onward?
venture_status1 <- vc_data_clean3 %>% group_by(status) %>% filter(!is.na(real_venture), founded_year < 1990-01-01) %>% summarise(mean_venture = mean(real_venture), sum_venture = sum(real_venture)) %>% mutate(prop_venture = sum_venture/sum(sum_venture))
## Warning: Factor `status` contains implicit NA, consider using
## `forcats::fct_explicit_na`
venture_status1
## # A tibble: 3 x 4
## status mean_venture sum_venture prop_venture
## <fct> <dbl> <dbl> <dbl>
## 1 acquired 43917321. 87834642. 0.724
## 2 closed 13678213. 13678213. 0.113
## 3 operating 3970226. 19851130. 0.164
venture_status2 <- vc_data_clean3 %>% group_by(status) %>% filter(!is.na(real_venture), founded_year > 1990-01-01) %>% summarise(mean_venture = mean(real_venture), sum_venture = sum(as.numeric(real_venture))) %>% mutate(prop_venture = sum_venture/sum(sum_venture))
## Warning: Factor `status` contains implicit NA, consider using
## `forcats::fct_explicit_na`
venture_status2
## # A tibble: 4 x 4
## status mean_venture sum_venture prop_venture
## <fct> <dbl> <dbl> <dbl>
## 1 acquired 18859124. 697787587. 0.0955
## 2 closed 5112920. 76693795. 0.0105
## 3 operating 20876276. 6367264069. 0.872
## 4 <NA> 13578712. 162944541. 0.0223
ggplot(venture_status1) + geom_col(mapping = aes(x = status, y = mean_venture))

ggplot(venture_status2) + geom_col(mapping = aes(x = status, y = mean_venture))

# Does the mean number of funding rounds vary from the 1980's onward?
fund_status_rnd_time1 <- vc_data_clean %>% group_by(status) %>% filter(!is.na(funding_rounds), founded_at < 1990-01-01) %>% summarise(mean_fund_rnd = mean(funding_rounds), sum_funding_rnds = sum(funding_rounds)) %>% mutate(prop_funding = sum_funding_rnds/sum(sum_funding_rnds))
## Warning: Factor `status` contains implicit NA, consider using
## `forcats::fct_explicit_na`
fund_status_rnd_time1
## # A tibble: 4 x 4
## status mean_fund_rnd sum_funding_rnds prop_funding
## <fct> <dbl> <int> <dbl>
## 1 acquired 1.16 22 0.0603
## 2 closed 1 7 0.0192
## 3 operating 1.31 319 0.874
## 4 <NA> 1 17 0.0466
fund_status_rnd_time2 <- vc_data_clean %>% group_by(status) %>% filter(!is.na(funding_rounds), founded_at > 1990-01-01) %>% summarise(mean_fund_rnd = mean(funding_rounds), sum_funding_rnds = sum(funding_rounds)) %>% mutate(prop_funding = sum_funding_rnds/sum(sum_funding_rnds))
## Warning: Factor `status` contains implicit NA, consider using
## `forcats::fct_explicit_na`
fund_status_rnd_time2
## # A tibble: 4 x 4
## status mean_fund_rnd sum_funding_rnds prop_funding
## <fct> <dbl> <int> <dbl>
## 1 acquired 2.12 6268 0.0913
## 2 closed 1.45 2885 0.0420
## 3 operating 1.79 58008 0.845
## 4 <NA> 1.66 1497 0.0218
ggplot(fund_status_rnd_time1) + geom_col(mapping = aes(x = status, y = mean_fund_rnd))

ggplot(fund_status_rnd_time2) + geom_col(mapping = aes(x = status, y = mean_fund_rnd))

Mean Venture Funding and Funding Rounds by Market from 1913 to 1990 and from 1990 to 2014
# Does mean real venture funding (venture funding adjusted for inflation) by market vary from the 1980's onward?
real_venture_market_time1 <-vc_data_clean3 %>% group_by(markets3) %>% filter(!is.na(real_venture), founded_year < 1990) %>% summarise(mean_funding = mean(real_venture), sum_funding = sum(real_venture)) %>% mutate(prop_funding = sum_funding/sum(sum_funding)) %>% arrange(desc(mean_funding))
## Warning: Factor `markets3` contains implicit NA, consider using
## `forcats::fct_explicit_na`
real_venture_market_time1
## # A tibble: 4 x 4
## markets3 mean_funding sum_funding prop_funding
## <fct> <dbl> <dbl> <dbl>
## 1 Big Data Analytics/Security 18623885. 18623885. 0.133
## 2 Other 17798893. 106793360. 0.763
## 3 Health/Medicine/Biotech 13678213. 13678213. 0.0977
## 4 Energy 892412. 892412. 0.00637
real_venture_market_time2 <-vc_data_clean3 %>% group_by(markets3) %>% filter(!is.na(real_venture), founded_year > 1990) %>% summarise(mean_funding = mean(real_venture), sum_funding = sum(real_venture)) %>% mutate(prop_funding = sum_funding/sum(sum_funding)) %>% arrange(desc(mean_funding))
## Warning: Factor `markets3` contains implicit NA, consider using
## `forcats::fct_explicit_na`
real_venture_market_time2
## # A tibble: 8 x 4
## markets3 mean_funding sum_funding prop_funding
## <fct> <dbl> <dbl> <dbl>
## 1 Finance 31353700. 313536996. 0.0440
## 2 Energy 30286308. 424008306. 0.0596
## 3 Health/Medicine/Biotech 28264322. 1582802055. 0.222
## 4 Internet 19775608. 1285414501. 0.181
## 5 Big Data Analytics/Security 19613363. 274587084. 0.0386
## 6 Other 16866863. 2951700952. 0.415
## 7 Apps/Social Media 10937640. 262503366. 0.0369
## 8 <NA> 3127957. 25023657. 0.00351
ggplot(real_venture_market_time1) + geom_col(mapping = aes(x = markets3, y = mean_funding))

ggplot(real_venture_market_time2) + geom_col(mapping = aes(x = markets3, y = mean_funding))

# Do the markets that recieve the most funding rounds on average vary over time?
fund_rnd_market_time1 <-vc_data_clean %>% group_by(markets3) %>% filter(!is.na(funding_rounds), founded_at < 1990-01-01) %>% summarise(mean_fund_rnd = mean(funding_rounds), sum_fund_rnd = sum(funding_rounds)) %>% mutate(prop_funding = sum_fund_rnd/sum(sum_fund_rnd)) %>% arrange(desc(mean_fund_rnd))
## Warning: Factor `markets3` contains implicit NA, consider using
## `forcats::fct_explicit_na`
fund_rnd_market_time1
## # A tibble: 8 x 4
## markets3 mean_fund_rnd sum_fund_rnd prop_funding
## <fct> <dbl> <int> <dbl>
## 1 Big Data Analytics/Security 2.2 11 0.0301
## 2 Internet 1.55 31 0.0849
## 3 Finance 1.5 6 0.0164
## 4 Energy 1.38 11 0.0301
## 5 Other 1.27 206 0.564
## 6 Health/Medicine/Biotech 1.26 43 0.118
## 7 <NA> 1.06 55 0.151
## 8 Apps/Social Media 1 2 0.00548
ggplot(fund_rnd_market_time1) + geom_col(mapping = aes(x = markets3, y = mean_fund_rnd)) + coord_flip()

fund_rnd_market_time2 <-vc_data_clean %>% group_by(markets3) %>% filter(!is.na(funding_rounds), founded_at > 1990-01-01) %>% summarise(mean_fund_rnd = mean(funding_rounds), sum_fund_rnd = sum(funding_rounds)) %>% mutate(prop_funding = sum_fund_rnd/sum(sum_fund_rnd)) %>% arrange(desc(mean_fund_rnd))
## Warning: Factor `markets3` contains implicit NA, consider using
## `forcats::fct_explicit_na`
fund_rnd_market_time2
## # A tibble: 8 x 4
## markets3 mean_fund_rnd sum_fund_rnd prop_funding
## <fct> <dbl> <int> <dbl>
## 1 Health/Medicine/Biotech 2.16 9328 0.136
## 2 Big Data Analytics/Security 2.07 2965 0.0432
## 3 Energy 2.06 1880 0.0274
## 4 Apps/Social Media 1.82 5520 0.0804
## 5 Internet 1.80 9850 0.143
## 6 Finance 1.80 1635 0.0238
## 7 Other 1.75 35321 0.514
## 8 <NA> 1.12 2159 0.0314
ggplot(fund_rnd_market_time2) + geom_col(mapping = aes(x = markets3, y = mean_fund_rnd)) + coord_flip()

Post-IPO Equity Value Among Operating Companies and Number of IPO’s Among Companies
head(vc_data_clean %>% arrange(desc(post_ipo_equity)))
## name
## 1 Clearwire
## 2 Charter Communications
## 3 Youku
## 4 Xerox
## 5 AOL
## 6 BlackBerry
## category_list
## 1 |Internet|Mobile|
## 2 <NA>
## 3 |Technology|Photography|
## 4 |Hardware + Software|
## 5 |Advertising Platforms|Content Creators|Digital Media|News|
## 6 |RIM|Wireless|Mobile|Hardware + Software|
## market funding_total_usd status country_code state_code
## 1 Internet 5700000000 acquired USA WA
## 2 <NA> 5162513431 operating USA CT
## 3 Technology 1320000000 operating CHN <NA>
## 4 Hardware + Software 1100000000 operating USA AL
## 5 Content Creators 1000000000 operating USA NY
## 6 Wireless 1000000000 operating CAN ON
## region city funding_rounds founded_at founded_month
## 1 Seattle Kirkland 4 2003-10-01 October
## 2 Hartford Stamford 2 1993-01-01 January
## 3 Shanghai Shanghai 6 2006-12-21 December
## 4 AL - Other Normal 1 1906-01-01 January
## 5 New York City New York 1 1985-05-24 May
## 6 Toronto Waterloo 1 1984-01-01 January
## founded_quarter founded_year first_funding_at last_funding_at seed venture
## 1 Q4 2003 2003-05-06 2003-02-27 0 0.0e+00
## 2 Q1 1993 1993-11-21 1993-09-15 0 0.0e+00
## 3 Q4 2006 2006-12-21 2006-04-28 0 1.2e+08
## 4 Q1 1906 1906-03-15 1906-03-15 0 0.0e+00
## 5 Q2 1985 1985-12-01 1985-12-01 0 0.0e+00
## 6 Q1 1984 1984-11-04 1984-11-04 0 0.0e+00
## equity_crowdfunding undisclosed convertible_note debt_financing angel grant
## 1 0 0 0 0 0 0
## 2 0 0 0 0 0 0
## 3 0 0 0 0 0 0
## 4 0 0 0 0 0 0
## 5 0 0 0 0 0 0
## 6 0 0 0 0 0 0
## private_equity post_ipo_equity post_ipo_debt secondary_market
## 1 8e+07 4700000000 9.2e+08 0
## 2 0e+00 1662513431 3.5e+09 0
## 3 0e+00 1200000000 0.0e+00 0
## 4 0e+00 1100000000 0.0e+00 0
## 5 0e+00 1000000000 0.0e+00 0
## 6 0e+00 1000000000 0.0e+00 0
## product_crowdfunding round_A round_B round_C round_D round_E
## 1 0 0 0 0 0 0
## 2 0 0 0 0 0 0
## 3 0 3000000 12000000 25000000 40000000 40000000
## 4 0 0 0 0 0 0
## 5 0 0 0 0 0 0
## 6 0 0 0 0 0 0
## fund_date_issue markets3
## 1 FALSE Internet
## 2 FALSE <NA>
## 3 TRUE Other
## 4 FALSE Other
## 5 FALSE Other
## 6 FALSE Other
tapply(vc_data_clean$post_ipo_equity,vc_data_clean$status,length)
## acquired closed operating
## 3692 2603 41829
## Are companies that receieve more funding rounds on average more likely to issue an IPO?
fund_rnd_num_ipo <- vc_data_clean %>% group_by(funding_rounds) %>% filter(!is.na(post_ipo_equity), post_ipo_equity > 0) %>% summarise(num_ipos = length(post_ipo_equity))
fund_rnd_num_ipo
## # A tibble: 12 x 2
## funding_rounds num_ipos
## <int> <int>
## 1 1 139
## 2 2 62
## 3 3 39
## 4 4 19
## 5 5 18
## 6 6 19
## 7 7 7
## 8 8 2
## 9 9 5
## 10 10 3
## 11 11 2
## 12 14 1
## Are companies that recieve more venture funding on average more likely to issue an IPO?
venture_num_ipo <- vc_data_clean2 %>% group_by(venture_cats) %>% filter(!is.na(post_ipo_equity), post_ipo_equity > 0) %>% summarise(num_ipos = length(post_ipo_equity))
venture_num_ipo
## # A tibble: 3 x 2
## venture_cats num_ipos
## <fct> <int>
## 1 0 to 117 million 109
## 2 117 to 235 million 6
## 3 235 to 352 million 1
## Are operating companies that recieve more funding rounds on average more likely to have a higher post-IPO value?
op_ipo_market_time2 <- vc_data_clean %>% group_by(funding_rounds) %>% filter(status == "operating", !is.na(post_ipo_equity), post_ipo_equity >0) %>% summarise(mean_ipo_equity = mean(post_ipo_equity), max_ipo_equity = max(post_ipo_equity), min_ipo_equity = min(post_ipo_equity))
op_ipo_market_time2
## # A tibble: 12 x 4
## funding_rounds mean_ipo_equity max_ipo_equity min_ipo_equity
## <int> <dbl> <dbl> <dbl>
## 1 1 100229068. 1100000000 10500
## 2 2 84592892. 1662513431 119238
## 3 3 36121746. 211000000 800000
## 4 4 74293737 478000000 1000000
## 5 5 69530041. 264000000 2300000
## 6 6 181599171. 1200000000 3467747
## 7 7 65529415. 186000000 1303000
## 8 8 6962500 12500000 1425000
## 9 9 38406000 74700000 630000
## 10 10 90353596. 166228809 353500
## 11 11 27899452. 40000000 15798903
## 12 14 2000000 2000000 2000000
## Are operating companies that recieve more venture funding on average more likely to have a higher mean post-IPO value?
op_ipo_market_time1 <- vc_data_clean2 %>% group_by(venture_cats) %>% filter(status == "operating", !is.na(post_ipo_equity), post_ipo_equity > 0, !is.na(venture)) %>% summarise(mean_ipo_equity = mean(post_ipo_equity), max_ipo_equity = max(post_ipo_equity), min_ipo_equity = min(post_ipo_equity))
op_ipo_market_time1
## # A tibble: 3 x 4
## venture_cats mean_ipo_equity max_ipo_equity min_ipo_equity
## <fct> <dbl> <dbl> <dbl>
## 1 0 to 117 million 57747995. 739265776 119238
## 2 117 to 235 million 188618915. 736000000 40000000
## 3 235 to 352 million 104478480 104478480 104478480
Subsetting the data to do a microanalysis of the Biotechnology market.
# unique(vc_data4$market)
Biotech <- vc_data_clean %>%
select(c(1, 3, 4, 5, 6, 7, 9, 12, 14)) %>%
filter(market == " Biotechnology ")
head(Biotech)
## name market funding_total_usd status
## 1 10X Technologies Biotechnology 3000000 operating
## 2 20/20 Gene Systems Inc. Biotechnology 150000 operating
## 3 22nd Century Group Biotechnology 15034750 operating
## 4 23andMe Biotechnology 111949900 operating
## 5 3-V Biosciences Biotechnology 78089000 operating
## 6 360imaging Biotechnology 1000000 operating
## country_code state_code city founded_month founded_year
## 1 USA CA Oakland January 2012
## 2 USA MD Rockville May 2000
## 3 USA NY Clarence January 1998
## 4 USA CA Mountain View April 2006
## 5 USA CA Menlo Park March 2007
## 6 USA GA Atlanta January 2005
view(Biotech)
The plot below is meant to visualize the status of the biotechnology companies.
plt <- Biotech %>%
ggplot()+
geom_col(aes(x = status, y = nrow(Biotech), fill = status))+
ylab("Number of Companies")+
xlab("Operational status")
plt

The below code further subsets the data to specifically look at the number of biotechnology companies that were founded per year. It also showed that the years are not continuous.
library(plyr)
## ------------------------------------------------------------------------------
## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)
## ------------------------------------------------------------------------------
##
## Attaching package: 'plyr'
## The following object is masked from 'package:lubridate':
##
## here
## The following objects are masked from 'package:plotly':
##
## arrange, mutate, rename, summarise
## The following objects are masked from 'package:dplyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
## The following object is masked from 'package:purrr':
##
## compact
Comp_year <- count(Biotech, "founded_year") %>%
rename(c("freq" = "total_comp_founded"))
# view(Comp_year)
Comp_year <- Comp_year[-c(53), ]
summary(Comp_year)
## founded_year total_comp_founded
## Min. :1912 Min. : 1.00
## 1st Qu.:1976 1st Qu.: 1.75
## Median :1988 Median : 9.50
## Mean :1983 Mean : 47.37
## 3rd Qu.:2001 3rd Qu.: 89.25
## Max. :2014 Max. :212.00
plt2 <- Comp_year %>%
ggplot()+
geom_point(aes(x = founded_year, y = total_comp_founded))+
xlab("Year")+
ylab("Companies Founded")
plt2

Similarly, this code looks at the number of companies founded per country. It is interesting to note that the United States far exceeds that of other countries.
Comp_country <- count(Biotech, "country_code") %>%
rename(c("freq" = "total_founded"))
# view(Comp_country)
summary(Comp_country)
## country_code total_founded
## ARE : 1 Min. : 1.00
## ARG : 1 1st Qu.: 1.00
## AUS : 1 Median : 6.00
## AUT : 1 Mean : 83.82
## BEL : 1 3rd Qu.: 27.00
## (Other):38 Max. :2716.00
## NA's : 1
plt3 <- Comp_country %>%
ggplot()+
geom_col(aes(x = country_code, y = total_founded, fill = country_code))+
coord_flip()+
xlab("Country")+
ylab("Companies Founded")
plt3

This code does the same as that above, except that it shows number of biotechnology companies founded per state. This was interesting because it showed that the ‘state_code’ column of the dataset also contains Canadian provinces. Therefore, it is more accurate to say that the below shows a look at companies found in North America.
Comp_state <- count(Biotech, "state_code") %>%
rename(c("freq" = "total_founded"))
# view(Comp_state)
summary(Comp_state)
## state_code total_founded
## AB : 1 Min. : 1.0
## AK : 1 1st Qu.: 7.0
## AL : 1 Median : 26.0
## AR : 1 Mean : 64.7
## AZ : 1 3rd Qu.: 55.0
## (Other):51 Max. :859.0
## NA's : 1
plt4 <- Comp_state %>%
ggplot()+
geom_col(aes(x = state_code, y = total_founded, fill = state_code))+
coord_flip()+
xlab("State")+
ylab("Companies Founded")+
ggtitle("North America Biotech Start-Ups")
plt4

How is funding related to operational status?
The code below shows that on average there is more funding assocaited with biotechnology companies that have been acquired.
Bt2 <- Biotech %>%
select(c(3, 4)) %>%
group_by(status) %>%
summarise_each(funs(mean(., na.rm = TRUE))) %>%
rename(c("funding_total_usd" = "mfunding_per_status"))
## Warning: Factor `status` contains implicit NA, consider using
## `forcats::fct_explicit_na`
## Warning: funs() is soft deprecated as of dplyr 0.8.0
## Please use a list of either functions or lambdas:
##
## # Simple named list:
## list(mean = mean, median = median)
##
## # Auto named with `tibble::lst()`:
## tibble::lst(mean, median)
##
## # Using lambdas
## list(~ mean(., trim = .2), ~ median(., na.rm = TRUE))
## This warning is displayed once per session.
head(Bt2)
## # A tibble: 4 x 2
## status mfunding_per_status
## <fct> <dbl>
## 1 acquired 36644074.
## 2 closed 22510100.
## 3 operating 19728717.
## 4 <NA> 25281727.
# view(Bt2)
What is the average amount of funding on a global level, North America level, on a city level, and per year?
The following chucks of code create subsetted datasets containing the desired information.
Bt3 shows the average funding per country.
Bt4 shows the average funding per state/province in North America.
Bt5 shows the average funding per city, globally.
Bt6 shows the average funding per year.
Bt3 <- Biotech %>%
select(c(3, 5)) %>%
group_by(country_code) %>%
summarise_each(funs(mean(., na.rm = TRUE))) %>%
rename(c("funding_total_usd" = "mfunding_per_country"))
## Warning: Factor `country_code` contains implicit NA, consider using
## `forcats::fct_explicit_na`
head(Bt3)
## # A tibble: 6 x 2
## country_code mfunding_per_country
## <fct> <dbl>
## 1 ARE 12000000
## 2 ARG NaN
## 3 AUS 9508229.
## 4 AUT 13802606.
## 5 BEL 33628258.
## 6 BRA 4855828.
# view(Bt3)
Bt4 <- Biotech %>%
select(c(3, 6)) %>%
group_by(state_code) %>%
summarise_each(funs(mean(., na.rm = TRUE))) %>%
rename(c("funding_total_usd" = "mfunding_per_state"))
## Warning: Factor `state_code` contains implicit NA, consider using
## `forcats::fct_explicit_na`
head(Bt4)
## # A tibble: 6 x 2
## state_code mfunding_per_state
## <fct> <dbl>
## 1 AB 14295781.
## 2 AK 45000
## 3 AL 4834702.
## 4 AR 10616017.
## 5 AZ 11963997.
## 6 BC 10212936.
# view(Bt4)
Bt5 <- Biotech %>%
select(c(3, 7)) %>%
group_by(city) %>%
summarise_each(funs(mean(., na.rm = TRUE))) %>%
rename(c("funding_total_usd" = "mfunding_per_city"))
## Warning: Factor `city` contains implicit NA, consider using
## `forcats::fct_explicit_na`
head(Bt5)
## # A tibble: 6 x 2
## city mfunding_per_city
## <fct> <dbl>
## 1 "\xc7an" 15270909
## 2 "\xc9cully" 7136390.
## 3 "\xc9vry" 387000
## 4 "A Coru\xf1a" 2033700
## 5 "Aachen" 7980528
## 6 "Aarhus" 5220000
# view(Bt5)
Bt6 <- Biotech %>%
select(c(3, 9)) %>%
group_by(founded_year) %>%
summarise_each(funs(mean(., na.rm = TRUE))) %>%
rename(c("funding_total_usd" = "mean_funding"))
head(Bt6)
## # A tibble: 6 x 2
## founded_year mean_funding
## <int> <dbl>
## 1 1912 3500000
## 2 1919 18100000
## 3 1921 48000
## 4 1928 100000000
## 5 1945 7400000
## 6 1947 34000000
# view(Bt6)
Where are there NA values for ‘funding_total_usd’ and what countries are they associated with?
The below is a simple scatter plot that shows how the average funding for biotechnology companies has changed over time.
plottt <- Bt6 %>%
ggplot()+
geom_point(aes(x = founded_year, y = mean_funding))
plottt
## Warning: Removed 1 rows containing missing values (geom_point).

How does the mean and the total funding compare for each of the levels of interest? How does the average compare with the theoretical allocation of funds (i.e. how does the mean compare to the total funding/the number of companies founded)?
Each “Analysis” is the dataset that contains the answer to this question at the various levels of interest.
Bt7 <- Biotech %>%
select(c(3, 9)) %>%
group_by(founded_year) %>%
summarise_each(funs(sum(., na.rm = TRUE))) %>%
rename(c("funding_total_usd" = "sum_funding"))
head(Bt7)
## # A tibble: 6 x 2
## founded_year sum_funding
## <int> <dbl>
## 1 1912 3500000
## 2 1919 18100000
## 3 1921 48000
## 4 1928 100000000
## 5 1945 7400000
## 6 1947 34000000
# view(Bt7)
Temp <- merge(Bt6, Bt7, by = 'founded_year')
tail(Temp)
## founded_year mean_funding sum_funding
## 48 2010 12738424 2394823700
## 49 2011 11578476 1991497862
## 50 2012 8639931 1235510120
## 51 2013 11797488 967394047
## 52 2014 4942266 34595862
## 53 NA 16420586 18850832645
Analysis1 <- merge(Temp, Comp_year, by = 'founded_year') %>%
mutate(funds_per_comp = sum_funding / total_comp_founded)
tail(Analysis1)
## founded_year mean_funding sum_funding total_comp_founded funds_per_comp
## 47 2009 11337623 2165486012 195 11105056
## 48 2010 12738424 2394823700 197 12156465
## 49 2011 11578476 1991497862 180 11063877
## 50 2012 8639931 1235510120 155 7971033
## 51 2013 11797488 967394047 92 10515153
## 52 2014 4942266 34595862 9 3843985
# view(Analysis1)
Bt8 <- Biotech %>%
select(c(3, 6)) %>%
group_by(state_code) %>%
summarise_each(funs(sum(., na.rm = TRUE))) %>%
rename(c("funding_total_usd" = "sfunding_per_state"))
## Warning: Factor `state_code` contains implicit NA, consider using
## `forcats::fct_explicit_na`
head(Bt8)
## # A tibble: 6 x 2
## state_code sfunding_per_state
## <fct> <dbl>
## 1 AB 157253596
## 2 AK 45000
## 3 AL 125702241
## 4 AR 63696100
## 5 AZ 358919918
## 6 BC 234897529
# view(Bt8)
Temp2 <- merge(Bt4, Bt8, by = 'state_code')
tail(Temp2)
## state_code mfunding_per_state sfunding_per_state
## 52 VA 16837485 656661896
## 53 WA 6329623 468392091
## 54 WI 8015495 312604315
## 55 WV 4318014 8636027
## 56 WY 1305000 1305000
## 57 <NA> 20226494 15594627186
Analysis2 <- merge(Temp2, Comp_state, by = 'state_code') %>%
mutate(funds_per_comp = sfunding_per_state / total_founded) %>%
rename(c("state_code" = "state"))
tail(Analysis2)
## state mfunding_per_state sfunding_per_state total_founded funds_per_comp
## 52 VA 16837485 656661896 45 14592487
## 53 WA 6329623 468392091 75 6245228
## 54 WI 8015495 312604315 39 8015495
## 55 WV 4318014 8636027 2 4318014
## 56 WY 1305000 1305000 1 1305000
## 57 <NA> 20226494 15594627186 859 18154397
view(Analysis2)
Bt9 <- Biotech %>%
select(c(3, 5)) %>%
group_by(country_code) %>%
summarise_each(funs(sum(., na.rm = TRUE))) %>%
rename(c("funding_total_usd" = "sfunding_per_country"))
## Warning: Factor `country_code` contains implicit NA, consider using
## `forcats::fct_explicit_na`
head(Bt9)
## # A tibble: 6 x 2
## country_code sfunding_per_country
## <fct> <dbl>
## 1 ARE 12000000
## 2 ARG 0
## 3 AUS 285246867
## 4 AUT 165631267
## 5 BEL 672565168
## 6 BRA 24279142
# view(Bt9)
Temp3 <- merge(Bt3, Bt9, by = 'country_code')
tail(Temp3)
## country_code mfunding_per_country sfunding_per_country
## 39 THA NaN 0
## 40 TUR 47890682 191562727
## 41 TWN 88158013 264474040
## 42 USA 21222290 56111735974
## 43 ZAF 2850000 2850000
## 44 <NA> 12832482 1527065388
Analysis3 <- merge(Temp3, Comp_country, by = 'country_code') %>%
mutate(funds_per_comp = sfunding_per_country / total_founded) %>%
rename(c("country_code" = "country"))
tail(Analysis3)
## country mfunding_per_country sfunding_per_country total_founded
## 39 THA NaN 0 1
## 40 TUR 47890682 191562727 4
## 41 TWN 88158013 264474040 3
## 42 USA 21222290 56111735974 2716
## 43 ZAF 2850000 2850000 1
## 44 <NA> 12832482 1527065388 132
## funds_per_comp
## 39 0
## 40 47890682
## 41 88158013
## 42 20659697
## 43 2850000
## 44 11568677
view(Analysis3)
Comp_city <- count(Biotech, "city") %>%
rename(c("freq" = "total_founded"))
# view(Comp_state)
summary(Comp_state)
## state_code total_founded
## AB : 1 Min. : 1.0
## AK : 1 1st Qu.: 7.0
## AL : 1 Median : 26.0
## AR : 1 Mean : 64.7
## AZ : 1 3rd Qu.: 55.0
## (Other):51 Max. :859.0
## NA's : 1
Bt10 <- Biotech %>%
select(c(3, 7)) %>%
group_by(city) %>%
summarise_each(funs(sum(., na.rm = TRUE))) %>%
rename(c("funding_total_usd" = "sfunding_per_city"))
## Warning: Factor `city` contains implicit NA, consider using
## `forcats::fct_explicit_na`
head(Bt10)
## # A tibble: 6 x 2
## city sfunding_per_city
## <fct> <dbl>
## 1 "\xc7an" 45812727
## 2 "\xc9cully" 14272781
## 3 "\xc9vry" 387000
## 4 "A Coru\xf1a" 2033700
## 5 "Aachen" 15961056
## 6 "Aarhus" 5220000
# view(Bt10)
Temp4 <- merge(Bt5, Bt10, by = 'city')
tail(Temp4)
## city mfunding_per_city sfunding_per_city
## 1037 Zevenbergen NaN 0
## 1038 Zhuhai 1762820 1762820
## 1039 Zug 2800000 2800000
## 1040 Zwijnaarde 50414506 100829011
## 1041 Zwingenberg 77766000 77766000
## 1042 <NA> 14195965 2171982642
Analysis4 <- merge(Temp4, Comp_city, by = 'city') %>%
mutate(funds_per_comp = sfunding_per_city / total_founded)
tail(Analysis4)
## city mfunding_per_city sfunding_per_city total_founded
## 1037 Zevenbergen NaN 0 1
## 1038 Zhuhai 1762820 1762820 1
## 1039 Zug 2800000 2800000 1
## 1040 Zwijnaarde 50414506 100829011 2
## 1041 Zwingenberg 77766000 77766000 1
## 1042 <NA> 14195965 2171982642 170
## funds_per_comp
## 1037 0
## 1038 1762820
## 1039 2800000
## 1040 50414506
## 1041 77766000
## 1042 12776368
view(Analysis4)