Data Importing and Cleaning/Tidying
Importing The Data (CPI Data and VC Investment Data from 1913 to 2014)
library(anytime)
library(tidyverse)
## ── Attaching packages ──────────────────────────────────────────────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.2 ✓ purrr 0.3.3
## ✓ tibble 2.1.3 ✓ dplyr 0.8.5
## ✓ tidyr 1.0.2 ✓ stringr 1.4.0
## ✓ readr 1.3.1 ✓ forcats 0.5.0
## ── Conflicts ─────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library(lubridate)
##
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
##
## date
library(ggplot2)
library(stringr)
setwd("~/Desktop/R Datasets/Data Science Datasets")
vc_data <- read.csv("investments_VC.csv",na.strings = "",stringsAsFactors = FALSE)
library("jsonlite")
##
## Attaching package: 'jsonlite'
## The following object is masked from 'package:purrr':
##
## flatten
json_file <- 'https://datahub.io/core/cpi-us/datapackage.json'
json_data <- fromJSON(paste(readLines(json_file), collapse=""))
## Warning in readLines(json_file): incomplete final line found on 'https://
## datahub.io/core/cpi-us/datapackage.json'
# get list of all resources:
print(json_data$resources$name)
## [1] "validation_report" "cpiai_csv" "cpiai_json"
## [4] "cpi-us_zip" "cpiai"
# print all tabular data(if exists any)
for(i in 1:length(json_data$resources$datahub$type)){
if(json_data$resources$datahub$type[i]=='derived/csv'){
path_to_file = json_data$resources$path[i]
data <- read.csv(url(path_to_file))
}
}
c_code_df <- read.csv("CountryCode.csv")
c_data_df <- read.csv("CountryData.csv")
Structure, head, and tail of Data
str(vc_data)
## 'data.frame': 49438 obs. of 34 variables:
## $ name : chr "#waywire" "&TV Communications" "'Rock' Your Paper" "(In)Touch Network" ...
## $ category_list : chr "|Entertainment|Politics|Social Media|News|" "|Games|" "|Publishing|Education|" "|Electronics|Guides|Coffee|Restaurants|Music|iPhone|Apps|Mobile|iOS|E-Commerce|" ...
## $ market : chr " News " " Games " " Publishing " " Electronics " ...
## $ funding_total_usd : chr " 17,50,000 " " 40,00,000 " "40,000" " 15,00,000 " ...
## $ status : chr "acquired" "operating" "operating" "operating" ...
## $ country_code : chr "USA" "USA" "EST" "GBR" ...
## $ state_code : chr "NY" "CA" NA NA ...
## $ region : chr "New York City" "Los Angeles" "Tallinn" "London" ...
## $ city : chr "New York" "Los Angeles" "Tallinn" "London" ...
## $ funding_rounds : int 1 2 1 1 2 1 1 1 1 1 ...
## $ founded_at : chr "6/1/12" NA "10/26/12" "4/1/11" ...
## $ founded_month : chr "2012-06" NA "2012-10" "2011-04" ...
## $ founded_quarter : chr "2012-Q2" NA "2012-Q4" "2011-Q2" ...
## $ founded_year : int 2012 NA 2012 2011 2014 2011 NA 2007 2010 NA ...
## $ first_funding_at : chr "6/30/12" "6/4/10" "8/9/12" "4/1/11" ...
## $ last_funding_at : chr "6/30/12" "9/23/10" "8/9/12" "4/1/11" ...
## $ seed : int 1750000 0 40000 1500000 0 0 0 0 0 41250 ...
## $ venture : num 0e+00 4e+06 0e+00 0e+00 0e+00 7e+06 0e+00 2e+06 0e+00 0e+00 ...
## $ equity_crowdfunding : int 0 0 0 0 60000 0 0 0 0 0 ...
## $ undisclosed : int 0 0 0 0 0 0 4912393 0 0 0 ...
## $ convertible_note : int 0 0 0 0 0 0 0 0 0 0 ...
## $ debt_financing : num 0 0 0 0 0 0 0 0 0 0 ...
## $ angel : int 0 0 0 0 0 0 0 0 0 0 ...
## $ grant : int 0 0 0 0 0 0 0 0 0 0 ...
## $ private_equity : num 0 0 0 0 0 0 0 0 0 0 ...
## $ post_ipo_equity : num 0 0 0 0 0 0 0 0 0 0 ...
## $ post_ipo_debt : num 0 0 0 0 0 0 0 0 0 0 ...
## $ secondary_market : int 0 0 0 0 0 0 0 0 0 0 ...
## $ product_crowdfunding: int 0 0 0 0 0 0 0 0 0 0 ...
## $ round_A : int 0 0 0 0 0 0 0 2000000 0 0 ...
## $ round_B : int 0 0 0 0 0 7000000 0 0 0 0 ...
## $ round_C : int 0 0 0 0 0 0 0 0 0 0 ...
## $ round_D : int 0 0 0 0 0 0 0 0 0 0 ...
## $ round_E : int 0 0 0 0 0 0 0 0 0 0 ...
head(vc_data)
## name
## 1 #waywire
## 2 &TV Communications
## 3 'Rock' Your Paper
## 4 (In)Touch Network
## 5 -R- Ranch and Mine
## 6 .Club Domains
## category_list
## 1 |Entertainment|Politics|Social Media|News|
## 2 |Games|
## 3 |Publishing|Education|
## 4 |Electronics|Guides|Coffee|Restaurants|Music|iPhone|Apps|Mobile|iOS|E-Commerce|
## 5 |Tourism|Entertainment|Games|
## 6 |Software|
## market funding_total_usd status country_code state_code
## 1 News 17,50,000 acquired USA NY
## 2 Games 40,00,000 operating USA CA
## 3 Publishing 40,000 operating EST <NA>
## 4 Electronics 15,00,000 operating GBR <NA>
## 5 Tourism 60,000 operating USA TX
## 6 Software 70,00,000 <NA> USA FL
## region city funding_rounds founded_at founded_month
## 1 New York City New York 1 6/1/12 2012-06
## 2 Los Angeles Los Angeles 2 <NA> <NA>
## 3 Tallinn Tallinn 1 10/26/12 2012-10
## 4 London London 1 4/1/11 2011-04
## 5 Dallas Fort Worth 2 1/1/14 2014-01
## 6 Ft. Lauderdale Oakland Park 1 10/10/11 2011-10
## founded_quarter founded_year first_funding_at last_funding_at seed venture
## 1 2012-Q2 2012 6/30/12 6/30/12 1750000 0e+00
## 2 <NA> NA 6/4/10 9/23/10 0 4e+06
## 3 2012-Q4 2012 8/9/12 8/9/12 40000 0e+00
## 4 2011-Q2 2011 4/1/11 4/1/11 1500000 0e+00
## 5 2014-Q1 2014 8/17/14 9/26/14 0 0e+00
## 6 2011-Q4 2011 5/31/13 5/31/13 0 7e+06
## equity_crowdfunding undisclosed convertible_note debt_financing angel grant
## 1 0 0 0 0 0 0
## 2 0 0 0 0 0 0
## 3 0 0 0 0 0 0
## 4 0 0 0 0 0 0
## 5 60000 0 0 0 0 0
## 6 0 0 0 0 0 0
## private_equity post_ipo_equity post_ipo_debt secondary_market
## 1 0 0 0 0
## 2 0 0 0 0
## 3 0 0 0 0
## 4 0 0 0 0
## 5 0 0 0 0
## 6 0 0 0 0
## product_crowdfunding round_A round_B round_C round_D round_E
## 1 0 0 0 0 0 0
## 2 0 0 0 0 0 0
## 3 0 0 0 0 0 0
## 4 0 0 0 0 0 0
## 5 0 0 0 0 0 0
## 6 0 0 7000000 0 0 0
tail(vc_data)
## name
## 49433 Zytoprotec
## 49434 Zzish
## 49435 ZZNode Science and Technology
## 49436 Zzzzapp Wireless ltd.
## 49437 [a]list games
## 49438 [x+1]
## category_list
## 49433 |Biotechnology|
## 49434 |Analytics|Gamification|Developer APIs|iOS|Android|Education|
## 49435 |Enterprise Software|
## 49436 |Web Development|Advertising|Wireless|Mobile|
## 49437 |Games|
## 49438 |Enterprise Software|
## market funding_total_usd status country_code state_code
## 49433 Biotechnology 26,86,600 operating AUT <NA>
## 49434 Education 3,20,000 operating GBR <NA>
## 49435 Enterprise Software 15,87,301 operating CHN <NA>
## 49436 Web Development 97,398 operating HRV <NA>
## 49437 Games 93,00,000 operating <NA> <NA>
## 49438 Enterprise Software 4,50,00,000 operating USA NY
## region city funding_rounds founded_at founded_month
## 49433 Vienna Gerasdorf Bei Wien 1 1/1/07 2007-01
## 49434 London London 1 1/28/13 2013-01
## 49435 Beijing Beijing 1 <NA> <NA>
## 49436 Split Split 5 5/13/12 2012-05
## 49437 <NA> <NA> 1 <NA> <NA>
## 49438 New York City New York 4 1/1/99 1999-01
## founded_quarter founded_year first_funding_at last_funding_at seed
## 49433 2007-Q1 2007 1/29/13 1/29/13 0
## 49434 2013-Q1 2013 3/24/14 3/24/14 320000
## 49435 <NA> NA 4/1/12 4/1/12 0
## 49436 2012-Q2 2012 11/1/11 9/10/14 71525
## 49437 <NA> NA 11/21/11 11/21/11 9300000
## 49438 1999-Q1 1999 6/1/08 4/4/13 0
## venture equity_crowdfunding undisclosed convertible_note debt_financing
## 49433 2686600 0 0 0 0.0e+00
## 49434 0 0 0 0 0.0e+00
## 49435 1587301 0 0 0 0.0e+00
## 49436 0 0 0 25873 0.0e+00
## 49437 0 0 0 0 0.0e+00
## 49438 28000000 0 0 0 1.7e+07
## angel grant private_equity post_ipo_equity post_ipo_debt secondary_market
## 49433 0 0 0 0 0 0
## 49434 0 0 0 0 0 0
## 49435 0 0 0 0 0 0
## 49436 0 0 0 0 0 0
## 49437 0 0 0 0 0 0
## 49438 0 0 0 0 0 0
## product_crowdfunding round_A round_B round_C round_D round_E
## 49433 0 2686600 0 0 0 0
## 49434 0 0 0 0 0 0
## 49435 0 1587301 0 0 0 0
## 49436 0 0 0 0 0 0
## 49437 0 0 0 0 0 0
## 49438 0 16000000 10000000 0 0 0
Mutate character variables into factors and dates
vc_data2 <- vc_data %>% mutate(category_list = as.factor(category_list), market = as.factor(market), status = as.factor(status), country_code = as.factor(country_code), state_code = as.factor(state_code), city = as.factor(city), founded_at = mdy(founded_at), first_funding_at = mdy(first_funding_at), last_funding_at = mdy(last_funding_at))
## Warning: 68 failed to parse.
## Warning: 10 failed to parse.
## Warning: 6 failed to parse.
head(vc_data2)
## name
## 1 #waywire
## 2 &TV Communications
## 3 'Rock' Your Paper
## 4 (In)Touch Network
## 5 -R- Ranch and Mine
## 6 .Club Domains
## category_list
## 1 |Entertainment|Politics|Social Media|News|
## 2 |Games|
## 3 |Publishing|Education|
## 4 |Electronics|Guides|Coffee|Restaurants|Music|iPhone|Apps|Mobile|iOS|E-Commerce|
## 5 |Tourism|Entertainment|Games|
## 6 |Software|
## market funding_total_usd status country_code state_code
## 1 News 17,50,000 acquired USA NY
## 2 Games 40,00,000 operating USA CA
## 3 Publishing 40,000 operating EST <NA>
## 4 Electronics 15,00,000 operating GBR <NA>
## 5 Tourism 60,000 operating USA TX
## 6 Software 70,00,000 <NA> USA FL
## region city funding_rounds founded_at founded_month
## 1 New York City New York 1 2012-06-01 2012-06
## 2 Los Angeles Los Angeles 2 <NA> <NA>
## 3 Tallinn Tallinn 1 2012-10-26 2012-10
## 4 London London 1 2011-04-01 2011-04
## 5 Dallas Fort Worth 2 2014-01-01 2014-01
## 6 Ft. Lauderdale Oakland Park 1 2011-10-10 2011-10
## founded_quarter founded_year first_funding_at last_funding_at seed venture
## 1 2012-Q2 2012 2012-06-30 2012-06-30 1750000 0e+00
## 2 <NA> NA 2010-06-04 2010-09-23 0 4e+06
## 3 2012-Q4 2012 2012-08-09 2012-08-09 40000 0e+00
## 4 2011-Q2 2011 2011-04-01 2011-04-01 1500000 0e+00
## 5 2014-Q1 2014 2014-08-17 2014-09-26 0 0e+00
## 6 2011-Q4 2011 2013-05-31 2013-05-31 0 7e+06
## equity_crowdfunding undisclosed convertible_note debt_financing angel grant
## 1 0 0 0 0 0 0
## 2 0 0 0 0 0 0
## 3 0 0 0 0 0 0
## 4 0 0 0 0 0 0
## 5 60000 0 0 0 0 0
## 6 0 0 0 0 0 0
## private_equity post_ipo_equity post_ipo_debt secondary_market
## 1 0 0 0 0
## 2 0 0 0 0
## 3 0 0 0 0
## 4 0 0 0 0
## 5 0 0 0 0
## 6 0 0 0 0
## product_crowdfunding round_A round_B round_C round_D round_E
## 1 0 0 0 0 0 0
## 2 0 0 0 0 0 0
## 3 0 0 0 0 0 0
## 4 0 0 0 0 0 0
## 5 0 0 0 0 0 0
## 6 0 0 7000000 0 0 0
Identify date issues
# Identify dates where first funding occurs before company founded
fund_date_issue <- vector(mode = "logical", length = nrow(vc_data2))
for(i in 1:nrow(vc_data2)){
if(difftime(vc_data2[i,"founded_at"],vc_data2[i,"first_funding_at"], units = "days") > 0 & !is.na(vc_data2[i,"founded_at"]) & !is.na(vc_data2[i,"first_funding_at"])){
vc_data2[i,"first_funding_at"] = vc_data2[i,"founded_at"]
fund_date_issue[i] = TRUE
}
}
fund_date_issue2 <- data.frame(fund_date_issue)
vc_data3 <- cbind(vc_data2,fund_date_issue2)
head(vc_data3)
## name
## 1 #waywire
## 2 &TV Communications
## 3 'Rock' Your Paper
## 4 (In)Touch Network
## 5 -R- Ranch and Mine
## 6 .Club Domains
## category_list
## 1 |Entertainment|Politics|Social Media|News|
## 2 |Games|
## 3 |Publishing|Education|
## 4 |Electronics|Guides|Coffee|Restaurants|Music|iPhone|Apps|Mobile|iOS|E-Commerce|
## 5 |Tourism|Entertainment|Games|
## 6 |Software|
## market funding_total_usd status country_code state_code
## 1 News 17,50,000 acquired USA NY
## 2 Games 40,00,000 operating USA CA
## 3 Publishing 40,000 operating EST <NA>
## 4 Electronics 15,00,000 operating GBR <NA>
## 5 Tourism 60,000 operating USA TX
## 6 Software 70,00,000 <NA> USA FL
## region city funding_rounds founded_at founded_month
## 1 New York City New York 1 2012-06-01 2012-06
## 2 Los Angeles Los Angeles 2 <NA> <NA>
## 3 Tallinn Tallinn 1 2012-10-26 2012-10
## 4 London London 1 2011-04-01 2011-04
## 5 Dallas Fort Worth 2 2014-01-01 2014-01
## 6 Ft. Lauderdale Oakland Park 1 2011-10-10 2011-10
## founded_quarter founded_year first_funding_at last_funding_at seed venture
## 1 2012-Q2 2012 2012-06-30 2012-06-30 1750000 0e+00
## 2 <NA> NA 2010-06-04 2010-09-23 0 4e+06
## 3 2012-Q4 2012 2012-10-26 2012-08-09 40000 0e+00
## 4 2011-Q2 2011 2011-04-01 2011-04-01 1500000 0e+00
## 5 2014-Q1 2014 2014-08-17 2014-09-26 0 0e+00
## 6 2011-Q4 2011 2013-05-31 2013-05-31 0 7e+06
## equity_crowdfunding undisclosed convertible_note debt_financing angel grant
## 1 0 0 0 0 0 0
## 2 0 0 0 0 0 0
## 3 0 0 0 0 0 0
## 4 0 0 0 0 0 0
## 5 60000 0 0 0 0 0
## 6 0 0 0 0 0 0
## private_equity post_ipo_equity post_ipo_debt secondary_market
## 1 0 0 0 0
## 2 0 0 0 0
## 3 0 0 0 0
## 4 0 0 0 0
## 5 0 0 0 0
## 6 0 0 0 0
## product_crowdfunding round_A round_B round_C round_D round_E fund_date_issue
## 1 0 0 0 0 0 0 FALSE
## 2 0 0 0 0 0 0 FALSE
## 3 0 0 0 0 0 0 TRUE
## 4 0 0 0 0 0 0 FALSE
## 5 0 0 0 0 0 0 FALSE
## 6 0 0 7000000 0 0 0 FALSE
# Identify dates in the founded_at column and set their year values equal to the founded_year value
head(vc_data3 %>% arrange(desc(founded_at)))
## name category_list
## 1 Anacomp |Consulting|
## 2 Casey's General Stores |E-Commerce|
## 3 Gruppo Argenta |Recreation|Hospitality|
## 4 Healthcare Corporation of America |Biotechnology|
## 5 Matthew Walker Comprehensive Health Center |Nonprofits|
## 6 Netsmart Technologies |Enterprise Software|
## market funding_total_usd status country_code state_code
## 1 Consulting 75,00,000 acquired USA CA
## 2 E-Commerce 56,90,00,000 operating USA IA
## 3 Recreation 12,86,60,000 operating <NA> <NA>
## 4 Biotechnology 91,25,000 operating USA TN
## 5 Nonprofits 1,73,210 operating USA TN
## 6 Enterprise Software 45,00,000 operating USA NY
## region city funding_rounds founded_at founded_month
## 1 San Diego San Diego 1 2068-01-01 1968-01
## 2 Des Moines Ankeny 1 2068-01-01 1968-01
## 3 <NA> <NA> 1 2068-01-01 1968-01
## 4 Nashville Nashville 2 2068-01-01 1968-01
## 5 Nashville Nashville 1 2068-01-01 1968-01
## 6 NY - Other Great River 1 2068-01-01 1968-01
## founded_quarter founded_year first_funding_at last_funding_at seed venture
## 1 1968-Q1 1968 2068-01-01 2009-06-08 0 7500000
## 2 1968-Q1 1968 2068-01-01 2010-08-23 0 0
## 3 1968-Q1 1968 2068-01-01 2014-01-21 0 0
## 4 1968-Q1 1968 2068-01-01 2014-06-25 0 0
## 5 1968-Q1 1968 2068-01-01 2014-02-12 0 0
## 6 1968-Q1 1968 2068-01-01 2005-09-19 0 4500000
## equity_crowdfunding undisclosed convertible_note debt_financing angel grant
## 1 0 0 0 0.00e+00 0 0
## 2 0 0 0 5.69e+08 0 0
## 3 0 0 0 0.00e+00 0 0
## 4 0 0 0 3.20e+06 0 0
## 5 0 0 0 0.00e+00 0 173210
## 6 0 0 0 0.00e+00 0 0
## private_equity post_ipo_equity post_ipo_debt secondary_market
## 1 0 0 0 0
## 2 0 0 0 0
## 3 128660000 0 0 0
## 4 5925000 0 0 0
## 5 0 0 0 0
## 6 0 0 0 0
## product_crowdfunding round_A round_B round_C round_D round_E fund_date_issue
## 1 0 0 0 0 0 0 TRUE
## 2 0 0 0 0 0 0 TRUE
## 3 0 0 0 0 0 0 TRUE
## 4 0 0 0 0 0 0 TRUE
## 5 0 0 0 0 0 0 TRUE
## 6 0 0 0 0 0 0 TRUE
for(i in 1:nrow(vc_data3)){
if(!is.na(vc_data3[i,"founded_at"]) & vc_data3[i,"founded_at"] > 2014-12-13){
year(vc_data3[i,"founded_at"]) = vc_data3[i,"founded_year"]
year(vc_data3[i,"first_funding_at"]) = vc_data3[i,"founded_year"]
year(vc_data3[i,"last_funding_at"]) = vc_data3[i,"founded_year"]
}
}
head(vc_data3)
## name
## 1 #waywire
## 2 &TV Communications
## 3 'Rock' Your Paper
## 4 (In)Touch Network
## 5 -R- Ranch and Mine
## 6 .Club Domains
## category_list
## 1 |Entertainment|Politics|Social Media|News|
## 2 |Games|
## 3 |Publishing|Education|
## 4 |Electronics|Guides|Coffee|Restaurants|Music|iPhone|Apps|Mobile|iOS|E-Commerce|
## 5 |Tourism|Entertainment|Games|
## 6 |Software|
## market funding_total_usd status country_code state_code
## 1 News 17,50,000 acquired USA NY
## 2 Games 40,00,000 operating USA CA
## 3 Publishing 40,000 operating EST <NA>
## 4 Electronics 15,00,000 operating GBR <NA>
## 5 Tourism 60,000 operating USA TX
## 6 Software 70,00,000 <NA> USA FL
## region city funding_rounds founded_at founded_month
## 1 New York City New York 1 2012-06-01 2012-06
## 2 Los Angeles Los Angeles 2 <NA> <NA>
## 3 Tallinn Tallinn 1 2012-10-26 2012-10
## 4 London London 1 2011-04-01 2011-04
## 5 Dallas Fort Worth 2 2014-01-01 2014-01
## 6 Ft. Lauderdale Oakland Park 1 2011-10-10 2011-10
## founded_quarter founded_year first_funding_at last_funding_at seed venture
## 1 2012-Q2 2012 2012-06-30 2012-06-30 1750000 0e+00
## 2 <NA> NA 2010-06-04 2010-09-23 0 4e+06
## 3 2012-Q4 2012 2012-10-26 2012-08-09 40000 0e+00
## 4 2011-Q2 2011 2011-04-01 2011-04-01 1500000 0e+00
## 5 2014-Q1 2014 2014-08-17 2014-09-26 0 0e+00
## 6 2011-Q4 2011 2011-05-31 2011-05-31 0 7e+06
## equity_crowdfunding undisclosed convertible_note debt_financing angel grant
## 1 0 0 0 0 0 0
## 2 0 0 0 0 0 0
## 3 0 0 0 0 0 0
## 4 0 0 0 0 0 0
## 5 60000 0 0 0 0 0
## 6 0 0 0 0 0 0
## private_equity post_ipo_equity post_ipo_debt secondary_market
## 1 0 0 0 0
## 2 0 0 0 0
## 3 0 0 0 0
## 4 0 0 0 0
## 5 0 0 0 0
## 6 0 0 0 0
## product_crowdfunding round_A round_B round_C round_D round_E fund_date_issue
## 1 0 0 0 0 0 0 FALSE
## 2 0 0 0 0 0 0 FALSE
## 3 0 0 0 0 0 0 TRUE
## 4 0 0 0 0 0 0 FALSE
## 5 0 0 0 0 0 0 FALSE
## 6 0 0 7000000 0 0 0 FALSE
tail(vc_data3)
## name
## 49433 Zytoprotec
## 49434 Zzish
## 49435 ZZNode Science and Technology
## 49436 Zzzzapp Wireless ltd.
## 49437 [a]list games
## 49438 [x+1]
## category_list
## 49433 |Biotechnology|
## 49434 |Analytics|Gamification|Developer APIs|iOS|Android|Education|
## 49435 |Enterprise Software|
## 49436 |Web Development|Advertising|Wireless|Mobile|
## 49437 |Games|
## 49438 |Enterprise Software|
## market funding_total_usd status country_code state_code
## 49433 Biotechnology 26,86,600 operating AUT <NA>
## 49434 Education 3,20,000 operating GBR <NA>
## 49435 Enterprise Software 15,87,301 operating CHN <NA>
## 49436 Web Development 97,398 operating HRV <NA>
## 49437 Games 93,00,000 operating <NA> <NA>
## 49438 Enterprise Software 4,50,00,000 operating USA NY
## region city funding_rounds founded_at founded_month
## 49433 Vienna Gerasdorf Bei Wien 1 2007-01-01 2007-01
## 49434 London London 1 2013-01-28 2013-01
## 49435 Beijing Beijing 1 <NA> <NA>
## 49436 Split Split 5 2012-05-13 2012-05
## 49437 <NA> <NA> 1 <NA> <NA>
## 49438 New York City New York 4 1999-01-01 1999-01
## founded_quarter founded_year first_funding_at last_funding_at seed
## 49433 2007-Q1 2007 2007-01-29 2007-01-29 0
## 49434 2013-Q1 2013 2013-03-24 2013-03-24 320000
## 49435 <NA> NA 2012-04-01 2012-04-01 0
## 49436 2012-Q2 2012 2012-05-13 2012-09-10 71525
## 49437 <NA> NA 2011-11-21 2011-11-21 9300000
## 49438 1999-Q1 1999 1999-06-01 1999-04-04 0
## venture equity_crowdfunding undisclosed convertible_note debt_financing
## 49433 2686600 0 0 0 0.0e+00
## 49434 0 0 0 0 0.0e+00
## 49435 1587301 0 0 0 0.0e+00
## 49436 0 0 0 25873 0.0e+00
## 49437 0 0 0 0 0.0e+00
## 49438 28000000 0 0 0 1.7e+07
## angel grant private_equity post_ipo_equity post_ipo_debt secondary_market
## 49433 0 0 0 0 0 0
## 49434 0 0 0 0 0 0
## 49435 0 0 0 0 0 0
## 49436 0 0 0 0 0 0
## 49437 0 0 0 0 0 0
## 49438 0 0 0 0 0 0
## product_crowdfunding round_A round_B round_C round_D round_E
## 49433 0 2686600 0 0 0 0
## 49434 0 0 0 0 0 0
## 49435 0 1587301 0 0 0 0
## 49436 0 0 0 0 0 0
## 49437 0 0 0 0 0 0
## 49438 0 16000000 10000000 0 0 0
## fund_date_issue
## 49433 FALSE
## 49434 FALSE
## 49435 FALSE
## 49436 TRUE
## 49437 FALSE
## 49438 FALSE
Remove commas from total funding column and extract month and quarter from founded_month and founded_quarter columns
vc_data3$funding_total_usd<- as.numeric(gsub(",","",vc_data3$funding_total_usd))
## Warning: NAs introduced by coercion
vc_data4 <- vc_data3 %>% mutate(founded_month = as.integer(substr(founded_month,6,7)), founded_quarter = as.factor(substr(founded_quarter,6,7)))
head(vc_data4)
## name
## 1 #waywire
## 2 &TV Communications
## 3 'Rock' Your Paper
## 4 (In)Touch Network
## 5 -R- Ranch and Mine
## 6 .Club Domains
## category_list
## 1 |Entertainment|Politics|Social Media|News|
## 2 |Games|
## 3 |Publishing|Education|
## 4 |Electronics|Guides|Coffee|Restaurants|Music|iPhone|Apps|Mobile|iOS|E-Commerce|
## 5 |Tourism|Entertainment|Games|
## 6 |Software|
## market funding_total_usd status country_code state_code
## 1 News 1750000 acquired USA NY
## 2 Games 4000000 operating USA CA
## 3 Publishing 40000 operating EST <NA>
## 4 Electronics 1500000 operating GBR <NA>
## 5 Tourism 60000 operating USA TX
## 6 Software 7000000 <NA> USA FL
## region city funding_rounds founded_at founded_month
## 1 New York City New York 1 2012-06-01 6
## 2 Los Angeles Los Angeles 2 <NA> NA
## 3 Tallinn Tallinn 1 2012-10-26 10
## 4 London London 1 2011-04-01 4
## 5 Dallas Fort Worth 2 2014-01-01 1
## 6 Ft. Lauderdale Oakland Park 1 2011-10-10 10
## founded_quarter founded_year first_funding_at last_funding_at seed venture
## 1 Q2 2012 2012-06-30 2012-06-30 1750000 0e+00
## 2 <NA> NA 2010-06-04 2010-09-23 0 4e+06
## 3 Q4 2012 2012-10-26 2012-08-09 40000 0e+00
## 4 Q2 2011 2011-04-01 2011-04-01 1500000 0e+00
## 5 Q1 2014 2014-08-17 2014-09-26 0 0e+00
## 6 Q4 2011 2011-05-31 2011-05-31 0 7e+06
## equity_crowdfunding undisclosed convertible_note debt_financing angel grant
## 1 0 0 0 0 0 0
## 2 0 0 0 0 0 0
## 3 0 0 0 0 0 0
## 4 0 0 0 0 0 0
## 5 60000 0 0 0 0 0
## 6 0 0 0 0 0 0
## private_equity post_ipo_equity post_ipo_debt secondary_market
## 1 0 0 0 0
## 2 0 0 0 0
## 3 0 0 0 0
## 4 0 0 0 0
## 5 0 0 0 0
## 6 0 0 0 0
## product_crowdfunding round_A round_B round_C round_D round_E fund_date_issue
## 1 0 0 0 0 0 0 FALSE
## 2 0 0 0 0 0 0 FALSE
## 3 0 0 0 0 0 0 TRUE
## 4 0 0 0 0 0 0 FALSE
## 5 0 0 0 0 0 0 FALSE
## 6 0 0 7000000 0 0 0 FALSE
Change founded_month integer variable to a factor with levels equal to the names of months
month <- vector(mode = "character", length = nrow(vc_data4))
for(i in 1:nrow(vc_data4)){
if(is.na(vc_data4[i,"founded_month"])){
month[i] = NA
}else if(vc_data4[i,"founded_month"] == 1){
month[i] = "January"
} else if(vc_data4[i,"founded_month"] == 2){
month[i] = "February"
} else if(vc_data4[i,"founded_month"] == 3){
month[i] = "March"
}else if(vc_data4[i,"founded_month"] == 4){
month[i] = "April"
}else if(vc_data4[i,"founded_month"] == 5){
month[i] = "May"
}else if(vc_data4[i,"founded_month"] == 6){
month[i] = "June"
}else if(vc_data4[i,"founded_month"] == 7){
month[i] = "July"
}else if(vc_data4[i,"founded_month"] == 8){
month[i] = "August"
}else if(vc_data4[i,"founded_month"] == 9){
month[i] = "September"
}else if(vc_data4[i,"founded_month"] == 10){
month[i] = "October"
}else if(vc_data4[i,"founded_month"] == 11){
month[i] = "November"
}else if(vc_data4[i,"founded_month"] == 12){
month[i] = "December"
}
}
for(i in 1:nrow(vc_data4)){
vc_data4[i,12] = month[i]
}
head(vc_data4)
## name
## 1 #waywire
## 2 &TV Communications
## 3 'Rock' Your Paper
## 4 (In)Touch Network
## 5 -R- Ranch and Mine
## 6 .Club Domains
## category_list
## 1 |Entertainment|Politics|Social Media|News|
## 2 |Games|
## 3 |Publishing|Education|
## 4 |Electronics|Guides|Coffee|Restaurants|Music|iPhone|Apps|Mobile|iOS|E-Commerce|
## 5 |Tourism|Entertainment|Games|
## 6 |Software|
## market funding_total_usd status country_code state_code
## 1 News 1750000 acquired USA NY
## 2 Games 4000000 operating USA CA
## 3 Publishing 40000 operating EST <NA>
## 4 Electronics 1500000 operating GBR <NA>
## 5 Tourism 60000 operating USA TX
## 6 Software 7000000 <NA> USA FL
## region city funding_rounds founded_at founded_month
## 1 New York City New York 1 2012-06-01 June
## 2 Los Angeles Los Angeles 2 <NA> <NA>
## 3 Tallinn Tallinn 1 2012-10-26 October
## 4 London London 1 2011-04-01 April
## 5 Dallas Fort Worth 2 2014-01-01 January
## 6 Ft. Lauderdale Oakland Park 1 2011-10-10 October
## founded_quarter founded_year first_funding_at last_funding_at seed venture
## 1 Q2 2012 2012-06-30 2012-06-30 1750000 0e+00
## 2 <NA> NA 2010-06-04 2010-09-23 0 4e+06
## 3 Q4 2012 2012-10-26 2012-08-09 40000 0e+00
## 4 Q2 2011 2011-04-01 2011-04-01 1500000 0e+00
## 5 Q1 2014 2014-08-17 2014-09-26 0 0e+00
## 6 Q4 2011 2011-05-31 2011-05-31 0 7e+06
## equity_crowdfunding undisclosed convertible_note debt_financing angel grant
## 1 0 0 0 0 0 0
## 2 0 0 0 0 0 0
## 3 0 0 0 0 0 0
## 4 0 0 0 0 0 0
## 5 60000 0 0 0 0 0
## 6 0 0 0 0 0 0
## private_equity post_ipo_equity post_ipo_debt secondary_market
## 1 0 0 0 0
## 2 0 0 0 0
## 3 0 0 0 0
## 4 0 0 0 0
## 5 0 0 0 0
## 6 0 0 0 0
## product_crowdfunding round_A round_B round_C round_D round_E fund_date_issue
## 1 0 0 0 0 0 0 FALSE
## 2 0 0 0 0 0 0 FALSE
## 3 0 0 0 0 0 0 TRUE
## 4 0 0 0 0 0 0 FALSE
## 5 0 0 0 0 0 0 FALSE
## 6 0 0 7000000 0 0 0 FALSE
vc_data5<- vc_data4 %>% mutate(founded_month = as.factor(founded_month))
head(vc_data5)
## name
## 1 #waywire
## 2 &TV Communications
## 3 'Rock' Your Paper
## 4 (In)Touch Network
## 5 -R- Ranch and Mine
## 6 .Club Domains
## category_list
## 1 |Entertainment|Politics|Social Media|News|
## 2 |Games|
## 3 |Publishing|Education|
## 4 |Electronics|Guides|Coffee|Restaurants|Music|iPhone|Apps|Mobile|iOS|E-Commerce|
## 5 |Tourism|Entertainment|Games|
## 6 |Software|
## market funding_total_usd status country_code state_code
## 1 News 1750000 acquired USA NY
## 2 Games 4000000 operating USA CA
## 3 Publishing 40000 operating EST <NA>
## 4 Electronics 1500000 operating GBR <NA>
## 5 Tourism 60000 operating USA TX
## 6 Software 7000000 <NA> USA FL
## region city funding_rounds founded_at founded_month
## 1 New York City New York 1 2012-06-01 June
## 2 Los Angeles Los Angeles 2 <NA> <NA>
## 3 Tallinn Tallinn 1 2012-10-26 October
## 4 London London 1 2011-04-01 April
## 5 Dallas Fort Worth 2 2014-01-01 January
## 6 Ft. Lauderdale Oakland Park 1 2011-10-10 October
## founded_quarter founded_year first_funding_at last_funding_at seed venture
## 1 Q2 2012 2012-06-30 2012-06-30 1750000 0e+00
## 2 <NA> NA 2010-06-04 2010-09-23 0 4e+06
## 3 Q4 2012 2012-10-26 2012-08-09 40000 0e+00
## 4 Q2 2011 2011-04-01 2011-04-01 1500000 0e+00
## 5 Q1 2014 2014-08-17 2014-09-26 0 0e+00
## 6 Q4 2011 2011-05-31 2011-05-31 0 7e+06
## equity_crowdfunding undisclosed convertible_note debt_financing angel grant
## 1 0 0 0 0 0 0
## 2 0 0 0 0 0 0
## 3 0 0 0 0 0 0
## 4 0 0 0 0 0 0
## 5 60000 0 0 0 0 0
## 6 0 0 0 0 0 0
## private_equity post_ipo_equity post_ipo_debt secondary_market
## 1 0 0 0 0
## 2 0 0 0 0
## 3 0 0 0 0
## 4 0 0 0 0
## 5 0 0 0 0
## 6 0 0 0 0
## product_crowdfunding round_A round_B round_C round_D round_E fund_date_issue
## 1 0 0 0 0 0 0 FALSE
## 2 0 0 0 0 0 0 FALSE
## 3 0 0 0 0 0 0 TRUE
## 4 0 0 0 0 0 0 FALSE
## 5 0 0 0 0 0 0 FALSE
## 6 0 0 7000000 0 0 0 FALSE
Group related market variable categories together to reduce the number of categories
markets2 <- vector(mode = "character", length = nrow(vc_data5))
markets3 <- factor(markets2,levels = c("Internet", "Big Data Analytics/Security", "Apps/Social Media", "Energy","Health/Medicine/Biotech", "Finance","Other"))
vc_data_clean <- cbind(vc_data5,markets3)
head(vc_data_clean)
## name
## 1 #waywire
## 2 &TV Communications
## 3 'Rock' Your Paper
## 4 (In)Touch Network
## 5 -R- Ranch and Mine
## 6 .Club Domains
## category_list
## 1 |Entertainment|Politics|Social Media|News|
## 2 |Games|
## 3 |Publishing|Education|
## 4 |Electronics|Guides|Coffee|Restaurants|Music|iPhone|Apps|Mobile|iOS|E-Commerce|
## 5 |Tourism|Entertainment|Games|
## 6 |Software|
## market funding_total_usd status country_code state_code
## 1 News 1750000 acquired USA NY
## 2 Games 4000000 operating USA CA
## 3 Publishing 40000 operating EST <NA>
## 4 Electronics 1500000 operating GBR <NA>
## 5 Tourism 60000 operating USA TX
## 6 Software 7000000 <NA> USA FL
## region city funding_rounds founded_at founded_month
## 1 New York City New York 1 2012-06-01 June
## 2 Los Angeles Los Angeles 2 <NA> <NA>
## 3 Tallinn Tallinn 1 2012-10-26 October
## 4 London London 1 2011-04-01 April
## 5 Dallas Fort Worth 2 2014-01-01 January
## 6 Ft. Lauderdale Oakland Park 1 2011-10-10 October
## founded_quarter founded_year first_funding_at last_funding_at seed venture
## 1 Q2 2012 2012-06-30 2012-06-30 1750000 0e+00
## 2 <NA> NA 2010-06-04 2010-09-23 0 4e+06
## 3 Q4 2012 2012-10-26 2012-08-09 40000 0e+00
## 4 Q2 2011 2011-04-01 2011-04-01 1500000 0e+00
## 5 Q1 2014 2014-08-17 2014-09-26 0 0e+00
## 6 Q4 2011 2011-05-31 2011-05-31 0 7e+06
## equity_crowdfunding undisclosed convertible_note debt_financing angel grant
## 1 0 0 0 0 0 0
## 2 0 0 0 0 0 0
## 3 0 0 0 0 0 0
## 4 0 0 0 0 0 0
## 5 60000 0 0 0 0 0
## 6 0 0 0 0 0 0
## private_equity post_ipo_equity post_ipo_debt secondary_market
## 1 0 0 0 0
## 2 0 0 0 0
## 3 0 0 0 0
## 4 0 0 0 0
## 5 0 0 0 0
## 6 0 0 0 0
## product_crowdfunding round_A round_B round_C round_D round_E fund_date_issue
## 1 0 0 0 0 0 0 FALSE
## 2 0 0 0 0 0 0 FALSE
## 3 0 0 0 0 0 0 TRUE
## 4 0 0 0 0 0 0 FALSE
## 5 0 0 0 0 0 0 FALSE
## 6 0 0 7000000 0 0 0 FALSE
## markets3
## 1 <NA>
## 2 <NA>
## 3 <NA>
## 4 <NA>
## 5 <NA>
## 6 <NA>
for(i in 1:nrow(vc_data5)){
if(is.na(vc_data5[i,"market"])){
vc_data_clean[i,"markets3"] = NA
} else if(str_detect(vc_data5[i,"market"],"Web") | str_detect(vc_data5[i,"market"],"Internet") | str_detect(vc_data5[i,"market"],"Cloud") | str_detect(vc_data5[i,"market"],"IT") | str_detect(vc_data5[i,"market"],"E-Commerce") | str_detect(vc_data5[i,"market"],"Online") | str_detect(vc_data5[i,"market"],"Information") | str_detect(vc_data5[i,"market"],"Enterprise")){
vc_data_clean[i,"markets3"] = "Internet"
} else if(str_detect(vc_data5[i,"market"],"Augmented Reality") | str_detect(vc_data5[i,"market"],"Data") | str_detect(vc_data5[i,"market"],"Artificial Intelligence") | str_detect(vc_data5[i,"market"],"Cyber") | str_detect(vc_data5[i,"market"],"Security") | str_detect(vc_data5[i,"market"],"Machine Learning") | str_detect(vc_data5[i,"market"],"Analytics") | str_detect(vc_data5[i,"market"],"Algorithms")){
vc_data_clean[i,"markets3"] = "Big Data Analytics/Security"
} else if(str_detect(vc_data5[i,"market"],"Mobile") | str_detect(vc_data5[i,"market"],"Social")){
vc_data_clean[i,"markets3"] = "Apps/Social Media"
} else if(str_detect(vc_data5[i,"market"],"Energy") | str_detect(vc_data5[i,"market"],"Biofeuls") | str_detect(vc_data5[i,"market"],"Biomass") | str_detect(vc_data5[i,"market"],"Clean") | str_detect(vc_data5[i,"market"],"Green") | str_detect(vc_data5[i,"market"],"Natural") | str_detect(vc_data5[i,"market"],"Oil") | str_detect(vc_data5[i,"market"],"Fuels") | str_detect(vc_data5[i,"market"],"Renewable")){
vc_data_clean[i,"markets3"] = "Energy"
} else if(str_detect(vc_data5[i,"market"],"Health") | str_detect(vc_data5[i,"market"],"Biometrics") | str_detect(vc_data5[i,"market"],"Biotechnology") | str_detect(vc_data5[i,"market"],"Medical")){
vc_data_clean[i,"markets3"] = "Health/Medicine/Biotech"
} else if(str_detect(vc_data5[i,"market"],"Finance") | str_detect(vc_data5[i,"market"],"Financial") | str_detect(vc_data5[i,"market"],"Banking") | str_detect(vc_data5[i,"market"],"Bitcoin") | str_detect(vc_data5[i,"market"],"P2P")){
vc_data_clean[i,"markets3"] = "Finance"
} else {
vc_data_clean[i,"markets3"] = "Other"
}
}
head(vc_data_clean)
## name
## 1 #waywire
## 2 &TV Communications
## 3 'Rock' Your Paper
## 4 (In)Touch Network
## 5 -R- Ranch and Mine
## 6 .Club Domains
## category_list
## 1 |Entertainment|Politics|Social Media|News|
## 2 |Games|
## 3 |Publishing|Education|
## 4 |Electronics|Guides|Coffee|Restaurants|Music|iPhone|Apps|Mobile|iOS|E-Commerce|
## 5 |Tourism|Entertainment|Games|
## 6 |Software|
## market funding_total_usd status country_code state_code
## 1 News 1750000 acquired USA NY
## 2 Games 4000000 operating USA CA
## 3 Publishing 40000 operating EST <NA>
## 4 Electronics 1500000 operating GBR <NA>
## 5 Tourism 60000 operating USA TX
## 6 Software 7000000 <NA> USA FL
## region city funding_rounds founded_at founded_month
## 1 New York City New York 1 2012-06-01 June
## 2 Los Angeles Los Angeles 2 <NA> <NA>
## 3 Tallinn Tallinn 1 2012-10-26 October
## 4 London London 1 2011-04-01 April
## 5 Dallas Fort Worth 2 2014-01-01 January
## 6 Ft. Lauderdale Oakland Park 1 2011-10-10 October
## founded_quarter founded_year first_funding_at last_funding_at seed venture
## 1 Q2 2012 2012-06-30 2012-06-30 1750000 0e+00
## 2 <NA> NA 2010-06-04 2010-09-23 0 4e+06
## 3 Q4 2012 2012-10-26 2012-08-09 40000 0e+00
## 4 Q2 2011 2011-04-01 2011-04-01 1500000 0e+00
## 5 Q1 2014 2014-08-17 2014-09-26 0 0e+00
## 6 Q4 2011 2011-05-31 2011-05-31 0 7e+06
## equity_crowdfunding undisclosed convertible_note debt_financing angel grant
## 1 0 0 0 0 0 0
## 2 0 0 0 0 0 0
## 3 0 0 0 0 0 0
## 4 0 0 0 0 0 0
## 5 60000 0 0 0 0 0
## 6 0 0 0 0 0 0
## private_equity post_ipo_equity post_ipo_debt secondary_market
## 1 0 0 0 0
## 2 0 0 0 0
## 3 0 0 0 0
## 4 0 0 0 0
## 5 0 0 0 0
## 6 0 0 0 0
## product_crowdfunding round_A round_B round_C round_D round_E fund_date_issue
## 1 0 0 0 0 0 0 FALSE
## 2 0 0 0 0 0 0 FALSE
## 3 0 0 0 0 0 0 TRUE
## 4 0 0 0 0 0 0 FALSE
## 5 0 0 0 0 0 0 FALSE
## 6 0 0 7000000 0 0 0 FALSE
## markets3
## 1 Other
## 2 Other
## 3 Other
## 4 Other
## 5 Other
## 6 Other
Geographical Data with Cleaned VC Data
c_code_df <- subset(c_code_df, select = -c(Country.1))
c_code_df <- c_code_df %>% rename(country = Country, country_code = Country.Code)
c_data_df <- c_data_df %>% rename(country = Country, region = Region, population = Population, area_sq_mi = Area..sq..mi.., pop_dens_per_mi = Pop..Density..per.sq..mi.., gdp_per_capita = GDP....per.capita., literacy = Literacy...., phones_per_1000 = Phones..per.1000.)
geographical_df1 <- merge(c_data_df,c_code_df)
geographical_df <- merge.data.frame(vc_data_clean, geographical_df1, by.x="country_code", by.y ="country_code")
geographical_df <- geographical_df %>% rename(global_region = region.y, region = region.x)
Create categorical variable for numerical venture variable
vc_data_clean2 <- vc_data_clean %>% filter(fund_date_issue != TRUE, !is.na(venture), !is.na(first_funding_at), venture > 0)
vc_fund_cats1 <- vc_data_clean2$venture
range(vc_data_clean2$venture)
## [1] 2.200e+01 2.351e+09
vc_data_clean2$venture_cats <- cut(vc_fund_cats1, 20, labels = c("0 to 117 million", "117 to 235 million","235 to 352 million","352 to 470 million","470 to 588 million","588 to 705 million","705 to 823 million","823 to 940 million","940 million to 1 billion","1 to 1.175 billion","1.17 to 1.3 billion","1.3 to 1.4 billion","1.4 to 1.53 billion","1.53 to 1.64 billion","1.64 to 1.76 billion", "1.76 to 1.88 billion", "1.88 to 1.99 billion","1.99 to 2.11 billion","2.11 to 2.23 billion","2.23 to 2.35 billion"))
head(vc_data_clean2)
## name category_list market
## 1 &TV Communications |Games| Games
## 2 .Club Domains |Software| Software
## 3 0-6.com |Curated Web| Curated Web
## 4 10 Minutes With |Education| Education
## 5 1000museums.com |Curated Web| Curated Web
## 6 1001 Menus |Local Businesses|Restaurants| Restaurants
## funding_total_usd status country_code state_code region
## 1 4000000 operating USA CA Los Angeles
## 2 7000000 <NA> USA FL Ft. Lauderdale
## 3 2000000 operating <NA> <NA> <NA>
## 4 4400000 operating GBR <NA> London
## 5 4962651 operating USA WA Seattle
## 6 4059079 operating FRA <NA> Paris
## city funding_rounds founded_at founded_month founded_quarter
## 1 Los Angeles 2 <NA> <NA> <NA>
## 2 Oakland Park 1 2011-10-10 October Q4
## 3 <NA> 1 2007-01-01 January Q1
## 4 London 2 2013-01-01 January Q1
## 5 Bellevue 6 2008-01-01 January Q1
## 6 Paris 4 2010-11-20 November Q4
## founded_year first_funding_at last_funding_at seed venture
## 1 NA 2010-06-04 2010-09-23 0 4000000
## 2 2011 2011-05-31 2011-05-31 0 7000000
## 3 2007 2007-03-19 2007-03-19 0 2000000
## 4 2013 2013-01-01 2013-10-09 400000 4000000
## 5 2008 2008-10-14 2008-09-19 0 3814772
## 6 2010 2010-12-15 2010-11-13 522169 3536910
## equity_crowdfunding undisclosed convertible_note debt_financing angel grant
## 1 0 0 0 0 0 0
## 2 0 0 0 0 0 0
## 3 0 0 0 0 0 0
## 4 0 0 0 0 0 0
## 5 0 0 0 1147879 0 0
## 6 0 0 0 0 0 0
## private_equity post_ipo_equity post_ipo_debt secondary_market
## 1 0 0 0 0
## 2 0 0 0 0
## 3 0 0 0 0
## 4 0 0 0 0
## 5 0 0 0 0
## 6 0 0 0 0
## product_crowdfunding round_A round_B round_C round_D round_E fund_date_issue
## 1 0 0 0 0 0 0 FALSE
## 2 0 0 7000000 0 0 0 FALSE
## 3 0 2000000 0 0 0 0 FALSE
## 4 0 4000000 0 0 0 0 FALSE
## 5 0 0 0 0 0 0 FALSE
## 6 0 3536910 0 0 0 0 FALSE
## markets3 venture_cats
## 1 Other 0 to 117 million
## 2 Other 0 to 117 million
## 3 Internet 0 to 117 million
## 4 Other 0 to 117 million
## 5 Internet 0 to 117 million
## 6 Other 0 to 117 million
Take a sample of 500 observations of venture funding and adjust the nominal venture funding for inflation such that funding from any time period is expressed in 2014 prices
real_venture<-vector(mode = "numeric", length = nrow(vc_data_clean2))
vc_data_clean2 <- cbind(vc_data_clean2,real_venture)
head(vc_data_clean2)
## name category_list market
## 1 &TV Communications |Games| Games
## 2 .Club Domains |Software| Software
## 3 0-6.com |Curated Web| Curated Web
## 4 10 Minutes With |Education| Education
## 5 1000museums.com |Curated Web| Curated Web
## 6 1001 Menus |Local Businesses|Restaurants| Restaurants
## funding_total_usd status country_code state_code region
## 1 4000000 operating USA CA Los Angeles
## 2 7000000 <NA> USA FL Ft. Lauderdale
## 3 2000000 operating <NA> <NA> <NA>
## 4 4400000 operating GBR <NA> London
## 5 4962651 operating USA WA Seattle
## 6 4059079 operating FRA <NA> Paris
## city funding_rounds founded_at founded_month founded_quarter
## 1 Los Angeles 2 <NA> <NA> <NA>
## 2 Oakland Park 1 2011-10-10 October Q4
## 3 <NA> 1 2007-01-01 January Q1
## 4 London 2 2013-01-01 January Q1
## 5 Bellevue 6 2008-01-01 January Q1
## 6 Paris 4 2010-11-20 November Q4
## founded_year first_funding_at last_funding_at seed venture
## 1 NA 2010-06-04 2010-09-23 0 4000000
## 2 2011 2011-05-31 2011-05-31 0 7000000
## 3 2007 2007-03-19 2007-03-19 0 2000000
## 4 2013 2013-01-01 2013-10-09 400000 4000000
## 5 2008 2008-10-14 2008-09-19 0 3814772
## 6 2010 2010-12-15 2010-11-13 522169 3536910
## equity_crowdfunding undisclosed convertible_note debt_financing angel grant
## 1 0 0 0 0 0 0
## 2 0 0 0 0 0 0
## 3 0 0 0 0 0 0
## 4 0 0 0 0 0 0
## 5 0 0 0 1147879 0 0
## 6 0 0 0 0 0 0
## private_equity post_ipo_equity post_ipo_debt secondary_market
## 1 0 0 0 0
## 2 0 0 0 0
## 3 0 0 0 0
## 4 0 0 0 0
## 5 0 0 0 0
## 6 0 0 0 0
## product_crowdfunding round_A round_B round_C round_D round_E fund_date_issue
## 1 0 0 0 0 0 0 FALSE
## 2 0 0 7000000 0 0 0 FALSE
## 3 0 2000000 0 0 0 0 FALSE
## 4 0 4000000 0 0 0 0 FALSE
## 5 0 0 0 0 0 0 FALSE
## 6 0 3536910 0 0 0 0 FALSE
## markets3 venture_cats real_venture
## 1 Other 0 to 117 million 0
## 2 Other 0 to 117 million 0
## 3 Internet 0 to 117 million 0
## 4 Other 0 to 117 million 0
## 5 Internet 0 to 117 million 0
## 6 Other 0 to 117 million 0
set.seed(123)
vc_data_clean3 <- sample_n(vc_data_clean2,500)
for(i in 1:nrow(data)){
for(j in 1:nrow(vc_data_clean3)){
if(!is.na(vc_data_clean3[j,"first_funding_at"]) & !is.na(vc_data_clean3[j,"venture"]) & difftime(vc_data_clean3[j,"first_funding_at"], data[i,"Date"], units = "days") >= 0 & difftime(vc_data_clean3[j,"first_funding_at"], data[i,"Date"], units = "days") < 31){
vc_data_clean3[j,"real_venture"] = vc_data_clean3[j,"venture"]*(data[1213,"Index"]/data[i,"Index"])
}
}
}
head(vc_data_clean3)
## name category_list
## 1 Sunbeam |Biotechnology|
## 2 SuperOx Wastewater Co |Manufacturing|
## 3 Brain Tunnelgenix Technologies |Biotechnology|
## 4 Audioscribe <NA>
## 5 Canopy Labs |Lead Generation|Analytics|
## 6 Manta |Professional Networking|Curated Web|
## market funding_total_usd status country_code state_code
## 1 Biotechnology 9329636 operating USA FL
## 2 Manufacturing 1700000 operating USA TX
## 3 Biotechnology 2563168 operating USA CT
## 4 <NA> 1499800 operating <NA> <NA>
## 5 Lead Generation 2064000 <NA> USA CA
## 6 Curated Web 47215715 operating USA OH
## region city funding_rounds founded_at founded_month
## 1 Ft. Lauderdale Fort Lauderdale 1 2007-01-01 January
## 2 Houston Houston 1 2010-01-01 January
## 3 Hartford Bridgeport 3 2006-01-01 January
## 4 <NA> <NA> 2 <NA> <NA>
## 5 SF Bay Area San Francisco 2 2012-01-01 January
## 6 Columbus, Ohio Columbus 2 2005-09-01 September
## founded_quarter founded_year first_funding_at last_funding_at seed
## 1 Q1 2007 2007-12-11 2007-12-11 0
## 2 Q1 2010 2010-05-24 2010-05-24 0
## 3 Q1 2006 2006-04-16 2006-09-17 100000
## 4 <NA> NA 2012-07-01 2012-10-01 0
## 5 Q1 2012 2012-12-13 2012-03-21 1500000
## 6 Q3 2005 2005-01-04 2005-04-02 0
## venture equity_crowdfunding undisclosed convertible_note debt_financing
## 1 9329636 0 0 0 0
## 2 1700000 0 0 0 0
## 3 2213168 0 0 0 0
## 4 1499800 0 0 0 0
## 5 564000 0 0 0 0
## 6 47215715 0 0 0 0
## angel grant private_equity post_ipo_equity post_ipo_debt secondary_market
## 1 0 0 0 0 0 0
## 2 0 0 0 0 0 0
## 3 0 0 0 0 0 0
## 4 0 0 0 0 0 0
## 5 0 0 0 0 0 0
## 6 0 0 0 0 0 0
## product_crowdfunding round_A round_B round_C round_D round_E fund_date_issue
## 1 0 0 0 0 0 0 FALSE
## 2 0 0 0 0 0 0 FALSE
## 3 250000 0 0 0 0 0 FALSE
## 4 0 0 0 0 0 0 FALSE
## 5 0 0 0 0 0 0 FALSE
## 6 0 0 0 0 0 0 FALSE
## markets3 venture_cats real_venture
## 1 Health/Medicine/Biotech 0 to 117 million 10390367.1
## 2 Other 0 to 117 million 1822627.4
## 3 Health/Medicine/Biotech 0 to 117 million 2569208.0
## 4 <NA> 0 to 117 million 1528805.4
## 5 Other 0 to 117 million 574599.5
## 6 Internet 0 to 117 million 57915632.9
vc_fund_cats1 <- vc_data_clean3$real_venture
range(vc_data_clean3$real_venture)
## [1] 0 319412108
vc_data_clean3$venture_cats <- cut(vc_fund_cats1, 10, labels = c("0 to 32 million", "32 to 64 million","64 to 96 million","96 to 128 million","128 to 160 million","160 to 192 million","192 to 224 million","224 to 256 million","256 to 288 million","288 to 320 million"))
head(vc_data_clean3)
## name category_list
## 1 Sunbeam |Biotechnology|
## 2 SuperOx Wastewater Co |Manufacturing|
## 3 Brain Tunnelgenix Technologies |Biotechnology|
## 4 Audioscribe <NA>
## 5 Canopy Labs |Lead Generation|Analytics|
## 6 Manta |Professional Networking|Curated Web|
## market funding_total_usd status country_code state_code
## 1 Biotechnology 9329636 operating USA FL
## 2 Manufacturing 1700000 operating USA TX
## 3 Biotechnology 2563168 operating USA CT
## 4 <NA> 1499800 operating <NA> <NA>
## 5 Lead Generation 2064000 <NA> USA CA
## 6 Curated Web 47215715 operating USA OH
## region city funding_rounds founded_at founded_month
## 1 Ft. Lauderdale Fort Lauderdale 1 2007-01-01 January
## 2 Houston Houston 1 2010-01-01 January
## 3 Hartford Bridgeport 3 2006-01-01 January
## 4 <NA> <NA> 2 <NA> <NA>
## 5 SF Bay Area San Francisco 2 2012-01-01 January
## 6 Columbus, Ohio Columbus 2 2005-09-01 September
## founded_quarter founded_year first_funding_at last_funding_at seed
## 1 Q1 2007 2007-12-11 2007-12-11 0
## 2 Q1 2010 2010-05-24 2010-05-24 0
## 3 Q1 2006 2006-04-16 2006-09-17 100000
## 4 <NA> NA 2012-07-01 2012-10-01 0
## 5 Q1 2012 2012-12-13 2012-03-21 1500000
## 6 Q3 2005 2005-01-04 2005-04-02 0
## venture equity_crowdfunding undisclosed convertible_note debt_financing
## 1 9329636 0 0 0 0
## 2 1700000 0 0 0 0
## 3 2213168 0 0 0 0
## 4 1499800 0 0 0 0
## 5 564000 0 0 0 0
## 6 47215715 0 0 0 0
## angel grant private_equity post_ipo_equity post_ipo_debt secondary_market
## 1 0 0 0 0 0 0
## 2 0 0 0 0 0 0
## 3 0 0 0 0 0 0
## 4 0 0 0 0 0 0
## 5 0 0 0 0 0 0
## 6 0 0 0 0 0 0
## product_crowdfunding round_A round_B round_C round_D round_E fund_date_issue
## 1 0 0 0 0 0 0 FALSE
## 2 0 0 0 0 0 0 FALSE
## 3 250000 0 0 0 0 0 FALSE
## 4 0 0 0 0 0 0 FALSE
## 5 0 0 0 0 0 0 FALSE
## 6 0 0 0 0 0 0 FALSE
## markets3 venture_cats real_venture
## 1 Health/Medicine/Biotech 0 to 32 million 10390367.1
## 2 Other 0 to 32 million 1822627.4
## 3 Health/Medicine/Biotech 0 to 32 million 2569208.0
## 4 <NA> 0 to 32 million 1528805.4
## 5 Other 0 to 32 million 574599.5
## 6 Internet 32 to 64 million 57915632.9
Data Analysis
Proportion of Companies by Funding Amount
# Is the proportion of companies by funding amount equal?
fund_prop <- vc_data_clean2 %>% group_by(venture_cats) %>% summarise(num_comps = length(status)) %>% mutate(prop_comps = num_comps/sum(num_comps))
fund_prop
## # A tibble: 10 x 3
## venture_cats num_comps prop_comps
## <fct> <int> <dbl>
## 1 0 to 117 million 22154 0.986
## 2 117 to 235 million 245 0.0109
## 3 235 to 352 million 38 0.00169
## 4 352 to 470 million 11 0.000490
## 5 470 to 588 million 5 0.000223
## 6 588 to 705 million 6 0.000267
## 7 705 to 823 million 5 0.000223
## 8 1.17 to 1.3 billion 1 0.0000445
## 9 1.4 to 1.53 billion 1 0.0000445
## 10 2.23 to 2.35 billion 1 0.0000445
# Is the mean number of funding rounds the same across all funding amounts?
fund_rnd_funding <- vc_data_clean2 %>% group_by(venture_cats) %>% filter(!is.na(funding_rounds)) %>% summarise(mean_rnds = mean(funding_rounds), sum_rnds = sum(funding_rounds)) %>% mutate(prop_rnds = sum_rnds/sum(sum_rnds))
fund_rnd_funding
## # A tibble: 10 x 4
## venture_cats mean_rnds sum_rnds prop_rnds
## <fct> <dbl> <int> <dbl>
## 1 0 to 117 million 2.10 46446 0.966
## 2 117 to 235 million 5.16 1265 0.0263
## 3 235 to 352 million 5.05 192 0.00399
## 4 352 to 470 million 4.45 49 0.00102
## 5 470 to 588 million 8 40 0.000832
## 6 588 to 705 million 5.67 34 0.000707
## 7 705 to 823 million 7.4 37 0.000769
## 8 1.17 to 1.3 billion 8 8 0.000166
## 9 1.4 to 1.53 billion 6 6 0.000125
## 10 2.23 to 2.35 billion 11 11 0.000229
ggplot(fund_rnd_funding) + geom_col(mapping = aes(x = venture_cats, y = mean_rnds)) + scale_x_discrete("Funding Amount") + scale_y_continuous("Mean Funding Rounds") + ggtitle("Mean Funding Rounds by Funding Amount") + theme(plot.title = element_text(hjust = 0.5)) + coord_flip()

Mean Venture Funding and Funding Rounds by Status from 1913 to 1990 and from 1990 to 2014
# Does mean real venture funding (venture funding adjusted for inflation) by status vary from the 1980's onward?
venture_status1 <- vc_data_clean3 %>% group_by(status) %>% filter(!is.na(real_venture), founded_year < 1990-01-01) %>% summarise(mean_venture = mean(real_venture), sum_venture = sum(real_venture)) %>% mutate(prop_venture = sum_venture/sum(sum_venture))
## Warning: Factor `status` contains implicit NA, consider using
## `forcats::fct_explicit_na`
venture_status1
## # A tibble: 3 x 4
## status mean_venture sum_venture prop_venture
## <fct> <dbl> <dbl> <dbl>
## 1 acquired 43917321. 87834642. 0.724
## 2 closed 13678213. 13678213. 0.113
## 3 operating 3970226. 19851130. 0.164
ggplot(venture_status1) + geom_col(mapping = aes(x = status, y = mean_venture)) +scale_x_discrete("Status")+ scale_y_continuous("Mean Funding Rounds") + ggtitle("Mean Funding Rounds by Funding Amount") + theme(plot.title = element_text(hjust = 0.5)) + coord_flip()

venture_status2 <- vc_data_clean3 %>% group_by(status) %>% filter(!is.na(real_venture), founded_year > 1990-01-01) %>% summarise(mean_venture = mean(real_venture), sum_venture = sum(as.numeric(real_venture))) %>% mutate(prop_venture = sum_venture/sum(sum_venture))
## Warning: Factor `status` contains implicit NA, consider using
## `forcats::fct_explicit_na`
venture_status2
## # A tibble: 4 x 4
## status mean_venture sum_venture prop_venture
## <fct> <dbl> <dbl> <dbl>
## 1 acquired 18859124. 697787587. 0.0955
## 2 closed 5112920. 76693795. 0.0105
## 3 operating 20876276. 6367264069. 0.872
## 4 <NA> 13578712. 162944541. 0.0223
ggplot(venture_status2) + geom_col(mapping = aes(x = status, y = mean_venture)) + scale_x_discrete("Status") + scale_y_continuous("Mean Real Venture Funding") + ggtitle("Mean Real Venture Funding by Funding Amount") + theme(plot.title = element_text(hjust = 0.5)) + coord_flip()

# Does the mean number of funding rounds vary from the 1980's onward?
fund_status_rnd_time1 <- vc_data_clean %>% group_by(status) %>% filter(!is.na(funding_rounds), founded_at < 1990-01-01) %>% summarise(mean_fund_rnd = mean(funding_rounds), sum_funding_rnds = sum(funding_rounds)) %>% mutate(prop_funding = sum_funding_rnds/sum(sum_funding_rnds))
## Warning: Factor `status` contains implicit NA, consider using
## `forcats::fct_explicit_na`
fund_status_rnd_time1
## # A tibble: 4 x 4
## status mean_fund_rnd sum_funding_rnds prop_funding
## <fct> <dbl> <int> <dbl>
## 1 acquired 1.16 22 0.0603
## 2 closed 1 7 0.0192
## 3 operating 1.31 319 0.874
## 4 <NA> 1 17 0.0466
ggplot(fund_status_rnd_time1) + geom_col(mapping = aes(x = status, y = mean_fund_rnd)) + scale_x_discrete("Status") + scale_y_continuous("Mean Funding Rounds") + ggtitle("Mean Funding Rounds by Status") + theme(plot.title = element_text(hjust = 0.5)) + coord_flip()

fund_status_rnd_time2 <- vc_data_clean %>% group_by(status) %>% filter(!is.na(funding_rounds), founded_at > 1990-01-01) %>% summarise(mean_fund_rnd = mean(funding_rounds), sum_funding_rnds = sum(funding_rounds)) %>% mutate(prop_funding = sum_funding_rnds/sum(sum_funding_rnds))
## Warning: Factor `status` contains implicit NA, consider using
## `forcats::fct_explicit_na`
fund_status_rnd_time2
## # A tibble: 4 x 4
## status mean_fund_rnd sum_funding_rnds prop_funding
## <fct> <dbl> <int> <dbl>
## 1 acquired 2.12 6268 0.0913
## 2 closed 1.45 2885 0.0420
## 3 operating 1.79 58008 0.845
## 4 <NA> 1.66 1497 0.0218
ggplot(fund_status_rnd_time2) + geom_col(mapping = aes(x = status, y = mean_fund_rnd)) + scale_x_discrete("Status") + scale_y_continuous("Mean Funding Rounds") + ggtitle("Mean Funding Rounds by Status") + theme(plot.title = element_text(hjust = 0.5)) + coord_flip()

Mean Venture Funding and Funding Rounds by Market from 1913 to 1990 and from 1990 to 2014
# Does mean real venture funding (venture funding adjusted for inflation) by market vary from the 1980's onward?
real_venture_market_time1 <-vc_data_clean3 %>% group_by(markets3) %>% filter(!is.na(real_venture), founded_year < 1990) %>% summarise(mean_funding = mean(real_venture), sum_funding = sum(real_venture)) %>% mutate(prop_funding = sum_funding/sum(sum_funding)) %>% arrange(desc(mean_funding))
## Warning: Factor `markets3` contains implicit NA, consider using
## `forcats::fct_explicit_na`
real_venture_market_time1
## # A tibble: 4 x 4
## markets3 mean_funding sum_funding prop_funding
## <fct> <dbl> <dbl> <dbl>
## 1 Big Data Analytics/Security 18623885. 18623885. 0.133
## 2 Other 17798893. 106793360. 0.763
## 3 Health/Medicine/Biotech 13678213. 13678213. 0.0977
## 4 Energy 892412. 892412. 0.00637
ggplot(real_venture_market_time1) + geom_col(mapping = aes(x = markets3, y = mean_funding)) + scale_x_discrete("Markets") + scale_y_continuous("Mean Real Venture Funding") + ggtitle("Mean Real Venture Funding by Market") + theme(plot.title = element_text(hjust = 0.5)) + coord_flip()

real_venture_market_time2 <-vc_data_clean3 %>% group_by(markets3) %>% filter(!is.na(real_venture), founded_year > 1990) %>% summarise(mean_funding = mean(real_venture), sum_funding = sum(real_venture)) %>% mutate(prop_funding = sum_funding/sum(sum_funding)) %>% arrange(desc(mean_funding))
## Warning: Factor `markets3` contains implicit NA, consider using
## `forcats::fct_explicit_na`
real_venture_market_time2
## # A tibble: 8 x 4
## markets3 mean_funding sum_funding prop_funding
## <fct> <dbl> <dbl> <dbl>
## 1 Finance 31353700. 313536996. 0.0440
## 2 Energy 30286308. 424008306. 0.0596
## 3 Health/Medicine/Biotech 28264322. 1582802055. 0.222
## 4 Internet 19775608. 1285414501. 0.181
## 5 Big Data Analytics/Security 19613363. 274587084. 0.0386
## 6 Other 16866863. 2951700952. 0.415
## 7 Apps/Social Media 10937640. 262503366. 0.0369
## 8 <NA> 3127957. 25023657. 0.00351
ggplot(real_venture_market_time2) + geom_col(mapping = aes(x = markets3, y = mean_funding)) + scale_x_discrete("Markets") + scale_y_continuous("Mean Real Venture Funding") + ggtitle("Mean Real Venture Funding by Market") + theme(plot.title = element_text(hjust = 0.5)) + coord_flip()

# Do the markets that recieve the most funding rounds on average vary over time?
fund_rnd_market_time1 <-vc_data_clean %>% group_by(markets3) %>% filter(!is.na(funding_rounds), founded_at < 1990-01-01) %>% summarise(mean_fund_rnd = mean(funding_rounds), sum_fund_rnd = sum(funding_rounds)) %>% mutate(prop_funding = sum_fund_rnd/sum(sum_fund_rnd)) %>% arrange(desc(mean_fund_rnd))
## Warning: Factor `markets3` contains implicit NA, consider using
## `forcats::fct_explicit_na`
fund_rnd_market_time1
## # A tibble: 8 x 4
## markets3 mean_fund_rnd sum_fund_rnd prop_funding
## <fct> <dbl> <int> <dbl>
## 1 Big Data Analytics/Security 2.2 11 0.0301
## 2 Internet 1.55 31 0.0849
## 3 Finance 1.5 6 0.0164
## 4 Energy 1.38 11 0.0301
## 5 Other 1.27 206 0.564
## 6 Health/Medicine/Biotech 1.26 43 0.118
## 7 <NA> 1.06 55 0.151
## 8 Apps/Social Media 1 2 0.00548
ggplot(fund_rnd_market_time1) + geom_col(mapping = aes(x = markets3, y = mean_fund_rnd)) + scale_x_discrete("Market") + scale_y_continuous("Mean Funding Rounds") + ggtitle("Mean Funding Rounds by Market") + theme(plot.title = element_text(hjust = 0.5)) + coord_flip()

fund_rnd_market_time2 <-vc_data_clean %>% group_by(markets3) %>% filter(!is.na(funding_rounds), founded_at > 1990-01-01) %>% summarise(mean_fund_rnd = mean(funding_rounds), sum_fund_rnd = sum(funding_rounds)) %>% mutate(prop_funding = sum_fund_rnd/sum(sum_fund_rnd)) %>% arrange(desc(mean_fund_rnd))
## Warning: Factor `markets3` contains implicit NA, consider using
## `forcats::fct_explicit_na`
fund_rnd_market_time2
## # A tibble: 8 x 4
## markets3 mean_fund_rnd sum_fund_rnd prop_funding
## <fct> <dbl> <int> <dbl>
## 1 Health/Medicine/Biotech 2.16 9328 0.136
## 2 Big Data Analytics/Security 2.07 2965 0.0432
## 3 Energy 2.06 1880 0.0274
## 4 Apps/Social Media 1.82 5520 0.0804
## 5 Internet 1.80 9850 0.143
## 6 Finance 1.80 1635 0.0238
## 7 Other 1.75 35321 0.514
## 8 <NA> 1.12 2159 0.0314
ggplot(fund_rnd_market_time2) + geom_col(mapping = aes(x = markets3, y = mean_fund_rnd)) + scale_x_discrete("Market") + scale_y_continuous("Mean Funding Rounds") + ggtitle("Mean Funding Rounds by Market") + theme(plot.title = element_text(hjust = 0.5)) + coord_flip()

Post-IPO Equity Value Among Operating Companies and Number of IPO’s Among Companies
head(vc_data_clean %>% arrange(desc(post_ipo_equity)))
## name
## 1 Clearwire
## 2 Charter Communications
## 3 Youku
## 4 Xerox
## 5 AOL
## 6 BlackBerry
## category_list
## 1 |Internet|Mobile|
## 2 <NA>
## 3 |Technology|Photography|
## 4 |Hardware + Software|
## 5 |Advertising Platforms|Content Creators|Digital Media|News|
## 6 |RIM|Wireless|Mobile|Hardware + Software|
## market funding_total_usd status country_code state_code
## 1 Internet 5700000000 acquired USA WA
## 2 <NA> 5162513431 operating USA CT
## 3 Technology 1320000000 operating CHN <NA>
## 4 Hardware + Software 1100000000 operating USA AL
## 5 Content Creators 1000000000 operating USA NY
## 6 Wireless 1000000000 operating CAN ON
## region city funding_rounds founded_at founded_month
## 1 Seattle Kirkland 4 2003-10-01 October
## 2 Hartford Stamford 2 1993-01-01 January
## 3 Shanghai Shanghai 6 2006-12-21 December
## 4 AL - Other Normal 1 1906-01-01 January
## 5 New York City New York 1 1985-05-24 May
## 6 Toronto Waterloo 1 1984-01-01 January
## founded_quarter founded_year first_funding_at last_funding_at seed venture
## 1 Q4 2003 2003-05-06 2003-02-27 0 0.0e+00
## 2 Q1 1993 1993-11-21 1993-09-15 0 0.0e+00
## 3 Q4 2006 2006-12-21 2006-04-28 0 1.2e+08
## 4 Q1 1906 1906-03-15 1906-03-15 0 0.0e+00
## 5 Q2 1985 1985-12-01 1985-12-01 0 0.0e+00
## 6 Q1 1984 1984-11-04 1984-11-04 0 0.0e+00
## equity_crowdfunding undisclosed convertible_note debt_financing angel grant
## 1 0 0 0 0 0 0
## 2 0 0 0 0 0 0
## 3 0 0 0 0 0 0
## 4 0 0 0 0 0 0
## 5 0 0 0 0 0 0
## 6 0 0 0 0 0 0
## private_equity post_ipo_equity post_ipo_debt secondary_market
## 1 8e+07 4700000000 9.2e+08 0
## 2 0e+00 1662513431 3.5e+09 0
## 3 0e+00 1200000000 0.0e+00 0
## 4 0e+00 1100000000 0.0e+00 0
## 5 0e+00 1000000000 0.0e+00 0
## 6 0e+00 1000000000 0.0e+00 0
## product_crowdfunding round_A round_B round_C round_D round_E
## 1 0 0 0 0 0 0
## 2 0 0 0 0 0 0
## 3 0 3000000 12000000 25000000 40000000 40000000
## 4 0 0 0 0 0 0
## 5 0 0 0 0 0 0
## 6 0 0 0 0 0 0
## fund_date_issue markets3
## 1 FALSE Internet
## 2 FALSE <NA>
## 3 TRUE Other
## 4 FALSE Other
## 5 FALSE Other
## 6 FALSE Other
tapply(vc_data_clean$post_ipo_equity,vc_data_clean$status,length)
## acquired closed operating
## 3692 2603 41829
## Are companies that receieve more funding rounds on average more likely to issue an IPO?
fund_rnd_num_ipo <- vc_data_clean %>% group_by(funding_rounds) %>% filter(!is.na(post_ipo_equity), post_ipo_equity > 0) %>% summarise(num_ipos = length(post_ipo_equity))
fund_rnd_num_ipo
## # A tibble: 12 x 2
## funding_rounds num_ipos
## <int> <int>
## 1 1 139
## 2 2 62
## 3 3 39
## 4 4 19
## 5 5 18
## 6 6 19
## 7 7 7
## 8 8 2
## 9 9 5
## 10 10 3
## 11 11 2
## 12 14 1
## Are companies that recieve more venture funding on average more likely to issue an IPO?
venture_num_ipo <- vc_data_clean2 %>% group_by(venture_cats) %>% filter(!is.na(post_ipo_equity), post_ipo_equity > 0) %>% summarise(num_ipos = length(post_ipo_equity))
venture_num_ipo
## # A tibble: 3 x 2
## venture_cats num_ipos
## <fct> <int>
## 1 0 to 117 million 109
## 2 117 to 235 million 6
## 3 235 to 352 million 1
## Are operating companies that recieve more funding rounds on average more likely to have a higher post-IPO value?
op_ipo_market1 <- vc_data_clean %>% group_by(funding_rounds) %>% filter(status == "operating", !is.na(post_ipo_equity), post_ipo_equity >0) %>% summarise(mean_ipo_equity = mean(post_ipo_equity), max_ipo_equity = max(post_ipo_equity), min_ipo_equity = min(post_ipo_equity))
op_ipo_market1
## # A tibble: 12 x 4
## funding_rounds mean_ipo_equity max_ipo_equity min_ipo_equity
## <int> <dbl> <dbl> <dbl>
## 1 1 100229068. 1100000000 10500
## 2 2 84592892. 1662513431 119238
## 3 3 36121746. 211000000 800000
## 4 4 74293737 478000000 1000000
## 5 5 69530041. 264000000 2300000
## 6 6 181599171. 1200000000 3467747
## 7 7 65529415. 186000000 1303000
## 8 8 6962500 12500000 1425000
## 9 9 38406000 74700000 630000
## 10 10 90353596. 166228809 353500
## 11 11 27899452. 40000000 15798903
## 12 14 2000000 2000000 2000000
ggplot(op_ipo_market1) + geom_col(mapping = aes(x = funding_rounds, y = mean_ipo_equity)) + xlab("Rounds") + scale_y_continuous("Mean Post-IPO Equity") + ggtitle("Mean Post-IPO Equity by Funding Rounds") + theme(plot.title = element_text(hjust = 0.5)) + coord_flip()

## Are operating companies that recieve more venture funding on average more likely to have a higher mean post-IPO value?
op_ipo_market_2 <- vc_data_clean2 %>% group_by(venture_cats) %>% filter(status == "operating", !is.na(post_ipo_equity), post_ipo_equity > 0, !is.na(venture)) %>% summarise(mean_ipo_equity = mean(post_ipo_equity), max_ipo_equity = max(post_ipo_equity), min_ipo_equity = min(post_ipo_equity))
op_ipo_market_2
## # A tibble: 3 x 4
## venture_cats mean_ipo_equity max_ipo_equity min_ipo_equity
## <fct> <dbl> <dbl> <dbl>
## 1 0 to 117 million 57747995. 739265776 119238
## 2 117 to 235 million 188618915. 736000000 40000000
## 3 235 to 352 million 104478480 104478480 104478480
ggplot(op_ipo_market_2) + geom_col(mapping = aes(x = venture_cats, y = mean_ipo_equity)) + scale_x_discrete("Venture Funding") + scale_y_continuous("Mean Funding Rounds") + ggtitle("Mean Post-IPO Equity by Venture Funding") + theme(plot.title = element_text(hjust = 0.5)) + coord_flip()

Subsetting the data to do a microanalysis of the Biotechnology market.
# unique(vc_data4$market)
Biotech <- vc_data_clean %>%
select(c(1, 3, 4, 5, 6, 7, 9, 12, 14)) %>%
filter(market == " Biotechnology ")
head(Biotech)
## name market funding_total_usd status
## 1 10X Technologies Biotechnology 3000000 operating
## 2 20/20 Gene Systems Inc. Biotechnology 150000 operating
## 3 22nd Century Group Biotechnology 15034750 operating
## 4 23andMe Biotechnology 111949900 operating
## 5 3-V Biosciences Biotechnology 78089000 operating
## 6 360imaging Biotechnology 1000000 operating
## country_code state_code city founded_month founded_year
## 1 USA CA Oakland January 2012
## 2 USA MD Rockville May 2000
## 3 USA NY Clarence January 1998
## 4 USA CA Mountain View April 2006
## 5 USA CA Menlo Park March 2007
## 6 USA GA Atlanta January 2005
view(Biotech)
The plot below is meant to visualize the status of the biotechnology companies.
plt <- Biotech %>%
ggplot()+
geom_col(aes(x = status, y = nrow(Biotech), fill = status))+
ylab("Number of Companies")+
xlab("Operational status")
plt

library(plyr)
## ------------------------------------------------------------------------------
## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)
## ------------------------------------------------------------------------------
##
## Attaching package: 'plyr'
## The following object is masked from 'package:lubridate':
##
## here
## The following objects are masked from 'package:plotly':
##
## arrange, mutate, rename, summarise
## The following objects are masked from 'package:dplyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
## The following object is masked from 'package:purrr':
##
## compact
Comp_year <- count(Biotech, "founded_year") %>%
rename(c("freq" = "total_comp_founded"))
# view(Comp_year)
Comp_year <- Comp_year[-c(53), ]
summary(Comp_year)
## founded_year total_comp_founded
## Min. :1912 Min. : 1.00
## 1st Qu.:1976 1st Qu.: 1.75
## Median :1988 Median : 9.50
## Mean :1983 Mean : 47.37
## 3rd Qu.:2001 3rd Qu.: 89.25
## Max. :2014 Max. :212.00
Comp_country <- count(Biotech, "country_code") %>%
rename(c("freq" = "total_founded"))
# view(Comp_country)
summary(Comp_country)
## country_code total_founded
## ARE : 1 Min. : 1.00
## ARG : 1 1st Qu.: 1.00
## AUS : 1 Median : 6.00
## AUT : 1 Mean : 83.82
## BEL : 1 3rd Qu.: 27.00
## (Other):38 Max. :2716.00
## NA's : 1
Comp_state <- count(Biotech, "state_code") %>%
rename(c("freq" = "total_founded"))
# view(Comp_state)
summary(Comp_state)
## state_code total_founded
## AB : 1 Min. : 1.0
## AK : 1 1st Qu.: 7.0
## AL : 1 Median : 26.0
## AR : 1 Mean : 64.7
## AZ : 1 3rd Qu.: 55.0
## (Other):51 Max. :859.0
## NA's : 1
How is funding related to operational status?
The code below shows that on average there is more funding assocaited with biotechnology companies that have been acquired.
Bt2 <- Biotech %>%
select(c(3, 4)) %>%
group_by(status) %>%
summarise_each(funs(mean(., na.rm = TRUE))) %>%
rename(c("funding_total_usd" = "mfunding_per_status"))
## Warning: Factor `status` contains implicit NA, consider using
## `forcats::fct_explicit_na`
## Warning: funs() is soft deprecated as of dplyr 0.8.0
## Please use a list of either functions or lambdas:
##
## # Simple named list:
## list(mean = mean, median = median)
##
## # Auto named with `tibble::lst()`:
## tibble::lst(mean, median)
##
## # Using lambdas
## list(~ mean(., trim = .2), ~ median(., na.rm = TRUE))
## This warning is displayed once per session.
head(Bt2)
## # A tibble: 4 x 2
## status mfunding_per_status
## <fct> <dbl>
## 1 acquired 36644074.
## 2 closed 22510100.
## 3 operating 19728717.
## 4 <NA> 25281727.
# view(Bt2)
What is the average amount of funding on a global level, North America level, on a city level, and per year?
The following chucks of code create subsetted datasets containing the desired information.
Bt3 shows the average funding per country.
Bt4 shows the average funding per state/province in North America.
Bt5 shows the average funding per city, globally.
Bt6 shows the average funding per year.
Bt3 <- Biotech %>%
select(c(3, 5)) %>%
group_by(country_code) %>%
summarise_each(funs(mean(., na.rm = TRUE))) %>%
rename(c("funding_total_usd" = "mfunding_per_country"))
## Warning: Factor `country_code` contains implicit NA, consider using
## `forcats::fct_explicit_na`
head(Bt3)
## # A tibble: 6 x 2
## country_code mfunding_per_country
## <fct> <dbl>
## 1 ARE 12000000
## 2 ARG NaN
## 3 AUS 9508229.
## 4 AUT 13802606.
## 5 BEL 33628258.
## 6 BRA 4855828.
# view(Bt3)
Bt4 <- Biotech %>%
select(c(3, 6)) %>%
group_by(state_code) %>%
summarise_each(funs(mean(., na.rm = TRUE))) %>%
rename(c("funding_total_usd" = "mfunding_per_state"))
## Warning: Factor `state_code` contains implicit NA, consider using
## `forcats::fct_explicit_na`
head(Bt4)
## # A tibble: 6 x 2
## state_code mfunding_per_state
## <fct> <dbl>
## 1 AB 14295781.
## 2 AK 45000
## 3 AL 4834702.
## 4 AR 10616017.
## 5 AZ 11963997.
## 6 BC 10212936.
# view(Bt4)
Bt5 <- Biotech %>%
select(c(3, 7)) %>%
group_by(city) %>%
summarise_each(funs(mean(., na.rm = TRUE))) %>%
rename(c("funding_total_usd" = "mfunding_per_city"))
## Warning: Factor `city` contains implicit NA, consider using
## `forcats::fct_explicit_na`
head(Bt5)
## # A tibble: 6 x 2
## city mfunding_per_city
## <fct> <dbl>
## 1 "\xc7an" 15270909
## 2 "\xc9cully" 7136390.
## 3 "\xc9vry" 387000
## 4 "A Coru\xf1a" 2033700
## 5 "Aachen" 7980528
## 6 "Aarhus" 5220000
# view(Bt5)
Bt6 <- Biotech %>%
select(c(3, 9)) %>%
group_by(founded_year) %>%
summarise_each(funs(mean(., na.rm = TRUE))) %>%
rename(c("funding_total_usd" = "mean_funding"))
head(Bt6)
## # A tibble: 6 x 2
## founded_year mean_funding
## <int> <dbl>
## 1 1912 3500000
## 2 1919 18100000
## 3 1921 48000
## 4 1928 100000000
## 5 1945 7400000
## 6 1947 34000000
# view(Bt6)
Where are there NA values for ‘funding_total_usd’ and what countries are they associated with?
How does the mean and the total funding compare for each of the levels of interest? How does the average compare with the theoretical allocation of funds (i.e. how does the mean compare to the total funding/the number of companies founded)?
Each “Analysis” is the dataset that contains the answer to this question at the various levels of interest.
Bt7 <- Biotech %>%
select(c(3, 9)) %>%
group_by(founded_year) %>%
summarise_each(funs(sum(., na.rm = TRUE))) %>%
rename(c("funding_total_usd" = "sum_funding"))
head(Bt7)
## # A tibble: 6 x 2
## founded_year sum_funding
## <int> <dbl>
## 1 1912 3500000
## 2 1919 18100000
## 3 1921 48000
## 4 1928 100000000
## 5 1945 7400000
## 6 1947 34000000
# view(Bt7)
Temp <- merge(Bt6, Bt7, by = 'founded_year')
tail(Temp)
## founded_year mean_funding sum_funding
## 48 2010 12738424 2394823700
## 49 2011 11578476 1991497862
## 50 2012 8639931 1235510120
## 51 2013 11797488 967394047
## 52 2014 4942266 34595862
## 53 NA 16420586 18850832645
Analysis1 <- merge(Temp, Comp_year, by = 'founded_year') %>%
mutate(funds_per_comp = sum_funding / total_comp_founded)
tail(Analysis1)
## founded_year mean_funding sum_funding total_comp_founded funds_per_comp
## 47 2009 11337623 2165486012 195 11105056
## 48 2010 12738424 2394823700 197 12156465
## 49 2011 11578476 1991497862 180 11063877
## 50 2012 8639931 1235510120 155 7971033
## 51 2013 11797488 967394047 92 10515153
## 52 2014 4942266 34595862 9 3843985
# view(Analysis1)
Bt8 <- Biotech %>%
select(c(3, 6)) %>%
group_by(state_code) %>%
summarise_each(funs(sum(., na.rm = TRUE))) %>%
rename(c("funding_total_usd" = "sfunding_per_state"))
## Warning: Factor `state_code` contains implicit NA, consider using
## `forcats::fct_explicit_na`
head(Bt8)
## # A tibble: 6 x 2
## state_code sfunding_per_state
## <fct> <dbl>
## 1 AB 157253596
## 2 AK 45000
## 3 AL 125702241
## 4 AR 63696100
## 5 AZ 358919918
## 6 BC 234897529
# view(Bt8)
Temp2 <- merge(Bt4, Bt8, by = 'state_code')
tail(Temp2)
## state_code mfunding_per_state sfunding_per_state
## 52 VA 16837485 656661896
## 53 WA 6329623 468392091
## 54 WI 8015495 312604315
## 55 WV 4318014 8636027
## 56 WY 1305000 1305000
## 57 <NA> 20226494 15594627186
Analysis2 <- merge(Temp2, Comp_state, by = 'state_code') %>%
mutate(funds_per_comp = sfunding_per_state / total_founded) %>%
rename(c("state_code" = "state"))
tail(Analysis2)
## state mfunding_per_state sfunding_per_state total_founded funds_per_comp
## 52 VA 16837485 656661896 45 14592487
## 53 WA 6329623 468392091 75 6245228
## 54 WI 8015495 312604315 39 8015495
## 55 WV 4318014 8636027 2 4318014
## 56 WY 1305000 1305000 1 1305000
## 57 <NA> 20226494 15594627186 859 18154397
view(Analysis2)
Bt9 <- Biotech %>%
select(c(3, 5)) %>%
group_by(country_code) %>%
summarise_each(funs(sum(., na.rm = TRUE))) %>%
rename(c("funding_total_usd" = "sfunding_per_country"))
## Warning: Factor `country_code` contains implicit NA, consider using
## `forcats::fct_explicit_na`
head(Bt9)
## # A tibble: 6 x 2
## country_code sfunding_per_country
## <fct> <dbl>
## 1 ARE 12000000
## 2 ARG 0
## 3 AUS 285246867
## 4 AUT 165631267
## 5 BEL 672565168
## 6 BRA 24279142
# view(Bt9)
Temp3 <- merge(Bt3, Bt9, by = 'country_code')
tail(Temp3)
## country_code mfunding_per_country sfunding_per_country
## 39 THA NaN 0
## 40 TUR 47890682 191562727
## 41 TWN 88158013 264474040
## 42 USA 21222290 56111735974
## 43 ZAF 2850000 2850000
## 44 <NA> 12832482 1527065388
Analysis3 <- merge(Temp3, Comp_country, by = 'country_code') %>%
mutate(funds_per_comp = sfunding_per_country / total_founded) %>%
rename(c("country_code" = "country"))
tail(Analysis3)
## country mfunding_per_country sfunding_per_country total_founded
## 39 THA NaN 0 1
## 40 TUR 47890682 191562727 4
## 41 TWN 88158013 264474040 3
## 42 USA 21222290 56111735974 2716
## 43 ZAF 2850000 2850000 1
## 44 <NA> 12832482 1527065388 132
## funds_per_comp
## 39 0
## 40 47890682
## 41 88158013
## 42 20659697
## 43 2850000
## 44 11568677
view(Analysis3)
Comp_city <- count(Biotech, "city") %>%
rename(c("freq" = "total_founded"))
# view(Comp_state)
summary(Comp_state)
## state_code total_founded
## AB : 1 Min. : 1.0
## AK : 1 1st Qu.: 7.0
## AL : 1 Median : 26.0
## AR : 1 Mean : 64.7
## AZ : 1 3rd Qu.: 55.0
## (Other):51 Max. :859.0
## NA's : 1
Bt10 <- Biotech %>%
select(c(3, 7)) %>%
group_by(city) %>%
summarise_each(funs(sum(., na.rm = TRUE))) %>%
rename(c("funding_total_usd" = "sfunding_per_city"))
## Warning: Factor `city` contains implicit NA, consider using
## `forcats::fct_explicit_na`
head(Bt10)
## # A tibble: 6 x 2
## city sfunding_per_city
## <fct> <dbl>
## 1 "\xc7an" 45812727
## 2 "\xc9cully" 14272781
## 3 "\xc9vry" 387000
## 4 "A Coru\xf1a" 2033700
## 5 "Aachen" 15961056
## 6 "Aarhus" 5220000
# view(Bt10)
Temp4 <- merge(Bt5, Bt10, by = 'city')
tail(Temp4)
## city mfunding_per_city sfunding_per_city
## 1037 Zevenbergen NaN 0
## 1038 Zhuhai 1762820 1762820
## 1039 Zug 2800000 2800000
## 1040 Zwijnaarde 50414506 100829011
## 1041 Zwingenberg 77766000 77766000
## 1042 <NA> 14195965 2171982642
Analysis4 <- merge(Temp4, Comp_city, by = 'city') %>%
mutate(funds_per_comp = sfunding_per_city / total_founded)
tail(Analysis4)
## city mfunding_per_city sfunding_per_city total_founded
## 1037 Zevenbergen NaN 0 1
## 1038 Zhuhai 1762820 1762820 1
## 1039 Zug 2800000 2800000 1
## 1040 Zwijnaarde 50414506 100829011 2
## 1041 Zwingenberg 77766000 77766000 1
## 1042 <NA> 14195965 2171982642 170
## funds_per_comp
## 1037 0
## 1038 1762820
## 1039 2800000
## 1040 50414506
## 1041 77766000
## 1042 12776368
view(Analysis4)
# Which global regions receive the most funding on average?
fund_global_market <- geographical_df %>% group_by(global_region) %>% filter(!is.na(funding_total_usd)) %>% summarise(mean_funding = mean(funding_total_usd)) %>% arrange(desc(mean_funding))
nrow(fund_global_market)
## [1] 1
fund_global_market
## mean_funding
## 1 16889473
# Which global regions receive the most funding?
most_fund_global <- geographical_df %>% group_by(global_region) %>% filter(!is.na(funding_total_usd)) %>% summarise(sum_funding = sum(funding_total_usd)) %>% arrange(desc(sum_funding))
most_fund_global
## sum_funding
## 1 626025224711
nrow(most_fund_global)
## [1] 1
head(most_fund_global)
## sum_funding
## 1 626025224711
ggplot(geographical_df) + geom_col(mapping = aes(x =global_region, y = funding_total_usd)) + coord_flip()
## Warning: Removed 7075 rows containing missing values (position_stack).

most_fund_globallm <- lm(funding_total_usd ~ global_region, data = geographical_df)
summary(most_fund_globallm)
##
## Call:
## lm(formula = funding_total_usd ~ global_region, data = geographical_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.438e+07 -1.792e+07 -1.257e+07 -4.057e+06 3.006e+10
##
## Coefficients:
## Estimate Std. Error t value
## (Intercept) 23401187 3468442 6.747
## global_regionBALTICS -20913797 20849675 -1.003
## global_regionC.W. OF IND. STATES 976952 9614409 0.102
## global_regionEASTERN EUROPE -21099234 11710808 -1.802
## global_regionLATIN AMER. & CARIB -19331352 7113198 -2.718
## global_regionNEAR EAST -13559844 7181416 -1.888
## global_regionNORTHERN AFRICA -12149222 28531318 -0.426
## global_regionNORTHERN AMERICA -4726565 3640021 -1.298
## global_regionOCEANIA -11891632 10477336 -1.135
## global_regionSUB-SAHARAN AFRICA -12004736 17291222 -0.694
## global_regionWESTERN EUROPE -13533753 4148532 -3.262
## Pr(>|t|)
## (Intercept) 1.53e-11 ***
## global_regionBALTICS 0.31583
## global_regionC.W. OF IND. STATES 0.91906
## global_regionEASTERN EUROPE 0.07160 .
## global_regionLATIN AMER. & CARIB 0.00658 **
## global_regionNEAR EAST 0.05901 .
## global_regionNORTHERN AFRICA 0.67024
## global_regionNORTHERN AMERICA 0.19412
## global_regionOCEANIA 0.25639
## global_regionSUB-SAHARAN AFRICA 0.48752
## global_regionWESTERN EUROPE 0.00111 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 176900000 on 37055 degrees of freedom
## (7075 observations deleted due to missingness)
## Multiple R-squared: 0.0006608, Adjusted R-squared: 0.0003911
## F-statistic: 2.45 on 10 and 37055 DF, p-value: 0.006383
anova(most_fund_globallm)
## Analysis of Variance Table
##
## Response: funding_total_usd
## Df Sum Sq Mean Sq F value Pr(>F)
## global_region 10 7.6643e+17 7.6643e+16 2.4504 0.006383 **
## Residuals 37055 1.1590e+21 3.1278e+16
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Does gdp have a correlation with VC funding?
gdp_funding <- geographical_df %>% group_by(gdp_per_capita) %>% filter(!is.na(funding_total_usd)) %>% summarise(mean_funding = mean(funding_total_usd)) %>% arrange(desc(mean_funding))
nrow(gdp_funding)
## [1] 1
head(gdp_funding)
## mean_funding
## 1 16889473
# Does population have a correlation with VC funding?
pop_to_funding <- geographical_df %>% group_by(population) %>% filter(!is.na(funding_total_usd)) %>% summarise(mean_funding = mean(funding_total_usd)) %>% arrange(desc(mean_funding))
nrow(pop_to_funding)
## [1] 1
head(pop_to_funding)
## mean_funding
## 1 16889473
pop_to_fundinglm <- lm(funding_total_usd ~ population, data = geographical_df)
summary(pop_to_fundinglm)
##
## Call:
## lm(formula = funding_total_usd ~ population, data = geographical_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.779e+07 -1.655e+07 -1.225e+07 -6.283e+06 3.006e+10
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.161e+07 1.375e+06 8.438 < 2e-16 ***
## population 1.993e-02 3.863e-03 5.160 2.48e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 176800000 on 37064 degrees of freedom
## (7075 observations deleted due to missingness)
## Multiple R-squared: 0.0007179, Adjusted R-squared: 0.0006909
## F-statistic: 26.63 on 1 and 37064 DF, p-value: 2.48e-07
anova(pop_to_fundinglm)
## Analysis of Variance Table
##
## Response: funding_total_usd
## Df Sum Sq Mean Sq F value Pr(>F)
## population 1 8.3261e+17 8.3261e+17 26.627 2.48e-07 ***
## Residuals 37064 1.1589e+21 3.1269e+16
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Does population density have a correlation with VC funding?
geographical_df$pop_dens_per_mi <- as.numeric(geographical_df$pop_dens_per_mi)
class(geographical_df$pop_dens_per_mi)
## [1] "numeric"
dens_to_fundinglm <- lm(funding_total_usd ~ pop_dens_per_mi, data = geographical_df)
summary(dens_to_fundinglm)
##
## Call:
## lm(formula = funding_total_usd ~ pop_dens_per_mi, data = geographical_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.789e+07 -1.644e+07 -1.464e+07 -6.551e+06 3.006e+10
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 15851170 3390261 4.676 2.94e-06 ***
## pop_dens_per_mi 9817 30855 0.318 0.75
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 176900000 on 37064 degrees of freedom
## (7075 observations deleted due to missingness)
## Multiple R-squared: 2.731e-06, Adjusted R-squared: -2.425e-05
## F-statistic: 0.1012 on 1 and 37064 DF, p-value: 0.7504
anova(dens_to_fundinglm)
## Analysis of Variance Table
##
## Response: funding_total_usd
## Df Sum Sq Mean Sq F value Pr(>F)
## pop_dens_per_mi 1 3.1676e+15 3.1676e+15 0.1012 0.7504
## Residuals 37064 1.1598e+21 3.1291e+16
# Does market category have a correlation with VC funding?
market_to_fundinglm <- lm(funding_total_usd ~ markets3, data = geographical_df)
summary(market_to_fundinglm)
##
## Call:
## lm(formula = funding_total_usd ~ markets3, data = geographical_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.725e+07 -1.521e+07 -1.314e+07 -5.738e+06 3.006e+10
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 16485786 2510528 6.567 5.22e-11 ***
## markets3Big Data Analytics/Security -330914 5428582 -0.061 0.951393
## markets3Apps/Social Media 3616481 4218785 0.857 0.391322
## markets3Energy 20768940 5840458 3.556 0.000377 ***
## markets3Health/Medicine/Biotech 4806392 3471825 1.384 0.166245
## markets3Finance 7136488 6463500 1.104 0.269548
## markets3Other -2343883 2832504 -0.827 0.407962
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 178300000 on 35215 degrees of freedom
## (8919 observations deleted due to missingness)
## Multiple R-squared: 0.0007046, Adjusted R-squared: 0.0005343
## F-statistic: 4.138 on 6 and 35215 DF, p-value: 0.0003682
anova(market_to_fundinglm)
## Analysis of Variance Table
##
## Response: funding_total_usd
## Df Sum Sq Mean Sq F value Pr(>F)
## markets3 6 7.8922e+17 1.3154e+17 4.1384 0.0003682 ***
## Residuals 35215 1.1193e+21 3.1785e+16
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1