Data Importing and Cleaning/Tidying

Importing The Data (CPI Data and VC Investment Data from 1913 to 2014)

library(anytime)
library(tidyverse)
## ── Attaching packages ──────────────────────────────────────────────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.2     ✓ purrr   0.3.3
## ✓ tibble  2.1.3     ✓ dplyr   0.8.5
## ✓ tidyr   1.0.2     ✓ stringr 1.4.0
## ✓ readr   1.3.1     ✓ forcats 0.5.0
## ── Conflicts ─────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
## 
##     date
library(ggplot2)
library(stringr)
setwd("~/Desktop/R Datasets/Data Science Datasets")
vc_data <- read.csv("investments_VC.csv",na.strings = "",stringsAsFactors = FALSE)

library("jsonlite")
## 
## Attaching package: 'jsonlite'
## The following object is masked from 'package:purrr':
## 
##     flatten
json_file <- 'https://datahub.io/core/cpi-us/datapackage.json'
json_data <- fromJSON(paste(readLines(json_file), collapse=""))
## Warning in readLines(json_file): incomplete final line found on 'https://
## datahub.io/core/cpi-us/datapackage.json'
# get list of all resources:
print(json_data$resources$name)
## [1] "validation_report" "cpiai_csv"         "cpiai_json"       
## [4] "cpi-us_zip"        "cpiai"
# print all tabular data(if exists any)
for(i in 1:length(json_data$resources$datahub$type)){
  if(json_data$resources$datahub$type[i]=='derived/csv'){
    path_to_file = json_data$resources$path[i]
    data <- read.csv(url(path_to_file))
  }
}

c_code_df <- read.csv("CountryCode.csv")
c_data_df <- read.csv("CountryData.csv")

Structure, head, and tail of Data

str(vc_data)
## 'data.frame':    49438 obs. of  34 variables:
##  $ name                : chr  "#waywire" "&TV Communications" "'Rock' Your Paper" "(In)Touch Network" ...
##  $ category_list       : chr  "|Entertainment|Politics|Social Media|News|" "|Games|" "|Publishing|Education|" "|Electronics|Guides|Coffee|Restaurants|Music|iPhone|Apps|Mobile|iOS|E-Commerce|" ...
##  $ market              : chr  " News " " Games " " Publishing " " Electronics " ...
##  $ funding_total_usd   : chr  " 17,50,000 " " 40,00,000 " "40,000" " 15,00,000 " ...
##  $ status              : chr  "acquired" "operating" "operating" "operating" ...
##  $ country_code        : chr  "USA" "USA" "EST" "GBR" ...
##  $ state_code          : chr  "NY" "CA" NA NA ...
##  $ region              : chr  "New York City" "Los Angeles" "Tallinn" "London" ...
##  $ city                : chr  "New York" "Los Angeles" "Tallinn" "London" ...
##  $ funding_rounds      : int  1 2 1 1 2 1 1 1 1 1 ...
##  $ founded_at          : chr  "6/1/12" NA "10/26/12" "4/1/11" ...
##  $ founded_month       : chr  "2012-06" NA "2012-10" "2011-04" ...
##  $ founded_quarter     : chr  "2012-Q2" NA "2012-Q4" "2011-Q2" ...
##  $ founded_year        : int  2012 NA 2012 2011 2014 2011 NA 2007 2010 NA ...
##  $ first_funding_at    : chr  "6/30/12" "6/4/10" "8/9/12" "4/1/11" ...
##  $ last_funding_at     : chr  "6/30/12" "9/23/10" "8/9/12" "4/1/11" ...
##  $ seed                : int  1750000 0 40000 1500000 0 0 0 0 0 41250 ...
##  $ venture             : num  0e+00 4e+06 0e+00 0e+00 0e+00 7e+06 0e+00 2e+06 0e+00 0e+00 ...
##  $ equity_crowdfunding : int  0 0 0 0 60000 0 0 0 0 0 ...
##  $ undisclosed         : int  0 0 0 0 0 0 4912393 0 0 0 ...
##  $ convertible_note    : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ debt_financing      : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ angel               : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ grant               : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ private_equity      : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ post_ipo_equity     : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ post_ipo_debt       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ secondary_market    : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ product_crowdfunding: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ round_A             : int  0 0 0 0 0 0 0 2000000 0 0 ...
##  $ round_B             : int  0 0 0 0 0 7000000 0 0 0 0 ...
##  $ round_C             : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ round_D             : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ round_E             : int  0 0 0 0 0 0 0 0 0 0 ...
head(vc_data)
##                 name
## 1           #waywire
## 2 &TV Communications
## 3  'Rock' Your Paper
## 4  (In)Touch Network
## 5 -R- Ranch and Mine
## 6      .Club Domains
##                                                                     category_list
## 1                                      |Entertainment|Politics|Social Media|News|
## 2                                                                         |Games|
## 3                                                          |Publishing|Education|
## 4 |Electronics|Guides|Coffee|Restaurants|Music|iPhone|Apps|Mobile|iOS|E-Commerce|
## 5                                                   |Tourism|Entertainment|Games|
## 6                                                                      |Software|
##          market funding_total_usd    status country_code state_code
## 1         News         17,50,000   acquired          USA         NY
## 2        Games         40,00,000  operating          USA         CA
## 3   Publishing             40,000 operating          EST       <NA>
## 4  Electronics         15,00,000  operating          GBR       <NA>
## 5      Tourism             60,000 operating          USA         TX
## 6     Software         70,00,000       <NA>          USA         FL
##           region         city funding_rounds founded_at founded_month
## 1  New York City     New York              1     6/1/12       2012-06
## 2    Los Angeles  Los Angeles              2       <NA>          <NA>
## 3        Tallinn      Tallinn              1   10/26/12       2012-10
## 4         London       London              1     4/1/11       2011-04
## 5         Dallas   Fort Worth              2     1/1/14       2014-01
## 6 Ft. Lauderdale Oakland Park              1   10/10/11       2011-10
##   founded_quarter founded_year first_funding_at last_funding_at    seed venture
## 1         2012-Q2         2012          6/30/12         6/30/12 1750000   0e+00
## 2            <NA>           NA           6/4/10         9/23/10       0   4e+06
## 3         2012-Q4         2012           8/9/12          8/9/12   40000   0e+00
## 4         2011-Q2         2011           4/1/11          4/1/11 1500000   0e+00
## 5         2014-Q1         2014          8/17/14         9/26/14       0   0e+00
## 6         2011-Q4         2011          5/31/13         5/31/13       0   7e+06
##   equity_crowdfunding undisclosed convertible_note debt_financing angel grant
## 1                   0           0                0              0     0     0
## 2                   0           0                0              0     0     0
## 3                   0           0                0              0     0     0
## 4                   0           0                0              0     0     0
## 5               60000           0                0              0     0     0
## 6                   0           0                0              0     0     0
##   private_equity post_ipo_equity post_ipo_debt secondary_market
## 1              0               0             0                0
## 2              0               0             0                0
## 3              0               0             0                0
## 4              0               0             0                0
## 5              0               0             0                0
## 6              0               0             0                0
##   product_crowdfunding round_A round_B round_C round_D round_E
## 1                    0       0       0       0       0       0
## 2                    0       0       0       0       0       0
## 3                    0       0       0       0       0       0
## 4                    0       0       0       0       0       0
## 5                    0       0       0       0       0       0
## 6                    0       0 7000000       0       0       0
tail(vc_data)
##                                name
## 49433                    Zytoprotec
## 49434                         Zzish
## 49435 ZZNode Science and Technology
## 49436         Zzzzapp Wireless ltd.
## 49437                 [a]list games
## 49438                         [x+1]
##                                                       category_list
## 49433                                               |Biotechnology|
## 49434 |Analytics|Gamification|Developer APIs|iOS|Android|Education|
## 49435                                         |Enterprise Software|
## 49436                 |Web Development|Advertising|Wireless|Mobile|
## 49437                                                       |Games|
## 49438                                         |Enterprise Software|
##                      market funding_total_usd    status country_code state_code
## 49433        Biotechnology         26,86,600  operating          AUT       <NA>
## 49434            Education          3,20,000  operating          GBR       <NA>
## 49435  Enterprise Software         15,87,301  operating          CHN       <NA>
## 49436      Web Development             97,398 operating          HRV       <NA>
## 49437                Games         93,00,000  operating         <NA>       <NA>
## 49438  Enterprise Software       4,50,00,000  operating          USA         NY
##              region               city funding_rounds founded_at founded_month
## 49433        Vienna Gerasdorf Bei Wien              1     1/1/07       2007-01
## 49434        London             London              1    1/28/13       2013-01
## 49435       Beijing            Beijing              1       <NA>          <NA>
## 49436         Split              Split              5    5/13/12       2012-05
## 49437          <NA>               <NA>              1       <NA>          <NA>
## 49438 New York City           New York              4     1/1/99       1999-01
##       founded_quarter founded_year first_funding_at last_funding_at    seed
## 49433         2007-Q1         2007          1/29/13         1/29/13       0
## 49434         2013-Q1         2013          3/24/14         3/24/14  320000
## 49435            <NA>           NA           4/1/12          4/1/12       0
## 49436         2012-Q2         2012          11/1/11         9/10/14   71525
## 49437            <NA>           NA         11/21/11        11/21/11 9300000
## 49438         1999-Q1         1999           6/1/08          4/4/13       0
##        venture equity_crowdfunding undisclosed convertible_note debt_financing
## 49433  2686600                   0           0                0        0.0e+00
## 49434        0                   0           0                0        0.0e+00
## 49435  1587301                   0           0                0        0.0e+00
## 49436        0                   0           0            25873        0.0e+00
## 49437        0                   0           0                0        0.0e+00
## 49438 28000000                   0           0                0        1.7e+07
##       angel grant private_equity post_ipo_equity post_ipo_debt secondary_market
## 49433     0     0              0               0             0                0
## 49434     0     0              0               0             0                0
## 49435     0     0              0               0             0                0
## 49436     0     0              0               0             0                0
## 49437     0     0              0               0             0                0
## 49438     0     0              0               0             0                0
##       product_crowdfunding  round_A  round_B round_C round_D round_E
## 49433                    0  2686600        0       0       0       0
## 49434                    0        0        0       0       0       0
## 49435                    0  1587301        0       0       0       0
## 49436                    0        0        0       0       0       0
## 49437                    0        0        0       0       0       0
## 49438                    0 16000000 10000000       0       0       0

Mutate character variables into factors and dates

vc_data2 <- vc_data %>% mutate(category_list = as.factor(category_list), market = as.factor(market), status = as.factor(status), country_code = as.factor(country_code), state_code = as.factor(state_code), city = as.factor(city), founded_at = mdy(founded_at), first_funding_at = mdy(first_funding_at), last_funding_at = mdy(last_funding_at))
## Warning: 68 failed to parse.
## Warning: 10 failed to parse.
## Warning: 6 failed to parse.
head(vc_data2)
##                 name
## 1           #waywire
## 2 &TV Communications
## 3  'Rock' Your Paper
## 4  (In)Touch Network
## 5 -R- Ranch and Mine
## 6      .Club Domains
##                                                                     category_list
## 1                                      |Entertainment|Politics|Social Media|News|
## 2                                                                         |Games|
## 3                                                          |Publishing|Education|
## 4 |Electronics|Guides|Coffee|Restaurants|Music|iPhone|Apps|Mobile|iOS|E-Commerce|
## 5                                                   |Tourism|Entertainment|Games|
## 6                                                                      |Software|
##          market funding_total_usd    status country_code state_code
## 1         News         17,50,000   acquired          USA         NY
## 2        Games         40,00,000  operating          USA         CA
## 3   Publishing             40,000 operating          EST       <NA>
## 4  Electronics         15,00,000  operating          GBR       <NA>
## 5      Tourism             60,000 operating          USA         TX
## 6     Software         70,00,000       <NA>          USA         FL
##           region         city funding_rounds founded_at founded_month
## 1  New York City     New York              1 2012-06-01       2012-06
## 2    Los Angeles  Los Angeles              2       <NA>          <NA>
## 3        Tallinn      Tallinn              1 2012-10-26       2012-10
## 4         London       London              1 2011-04-01       2011-04
## 5         Dallas   Fort Worth              2 2014-01-01       2014-01
## 6 Ft. Lauderdale Oakland Park              1 2011-10-10       2011-10
##   founded_quarter founded_year first_funding_at last_funding_at    seed venture
## 1         2012-Q2         2012       2012-06-30      2012-06-30 1750000   0e+00
## 2            <NA>           NA       2010-06-04      2010-09-23       0   4e+06
## 3         2012-Q4         2012       2012-08-09      2012-08-09   40000   0e+00
## 4         2011-Q2         2011       2011-04-01      2011-04-01 1500000   0e+00
## 5         2014-Q1         2014       2014-08-17      2014-09-26       0   0e+00
## 6         2011-Q4         2011       2013-05-31      2013-05-31       0   7e+06
##   equity_crowdfunding undisclosed convertible_note debt_financing angel grant
## 1                   0           0                0              0     0     0
## 2                   0           0                0              0     0     0
## 3                   0           0                0              0     0     0
## 4                   0           0                0              0     0     0
## 5               60000           0                0              0     0     0
## 6                   0           0                0              0     0     0
##   private_equity post_ipo_equity post_ipo_debt secondary_market
## 1              0               0             0                0
## 2              0               0             0                0
## 3              0               0             0                0
## 4              0               0             0                0
## 5              0               0             0                0
## 6              0               0             0                0
##   product_crowdfunding round_A round_B round_C round_D round_E
## 1                    0       0       0       0       0       0
## 2                    0       0       0       0       0       0
## 3                    0       0       0       0       0       0
## 4                    0       0       0       0       0       0
## 5                    0       0       0       0       0       0
## 6                    0       0 7000000       0       0       0

Identify date issues

# Identify dates where first funding occurs before company founded

fund_date_issue <- vector(mode = "logical", length = nrow(vc_data2))

for(i in 1:nrow(vc_data2)){
  if(difftime(vc_data2[i,"founded_at"],vc_data2[i,"first_funding_at"], units = "days") > 0 & !is.na(vc_data2[i,"founded_at"]) & !is.na(vc_data2[i,"first_funding_at"])){
    vc_data2[i,"first_funding_at"] = vc_data2[i,"founded_at"]
    fund_date_issue[i] = TRUE
  }
}

fund_date_issue2 <- data.frame(fund_date_issue)

vc_data3 <- cbind(vc_data2,fund_date_issue2)

head(vc_data3)
##                 name
## 1           #waywire
## 2 &TV Communications
## 3  'Rock' Your Paper
## 4  (In)Touch Network
## 5 -R- Ranch and Mine
## 6      .Club Domains
##                                                                     category_list
## 1                                      |Entertainment|Politics|Social Media|News|
## 2                                                                         |Games|
## 3                                                          |Publishing|Education|
## 4 |Electronics|Guides|Coffee|Restaurants|Music|iPhone|Apps|Mobile|iOS|E-Commerce|
## 5                                                   |Tourism|Entertainment|Games|
## 6                                                                      |Software|
##          market funding_total_usd    status country_code state_code
## 1         News         17,50,000   acquired          USA         NY
## 2        Games         40,00,000  operating          USA         CA
## 3   Publishing             40,000 operating          EST       <NA>
## 4  Electronics         15,00,000  operating          GBR       <NA>
## 5      Tourism             60,000 operating          USA         TX
## 6     Software         70,00,000       <NA>          USA         FL
##           region         city funding_rounds founded_at founded_month
## 1  New York City     New York              1 2012-06-01       2012-06
## 2    Los Angeles  Los Angeles              2       <NA>          <NA>
## 3        Tallinn      Tallinn              1 2012-10-26       2012-10
## 4         London       London              1 2011-04-01       2011-04
## 5         Dallas   Fort Worth              2 2014-01-01       2014-01
## 6 Ft. Lauderdale Oakland Park              1 2011-10-10       2011-10
##   founded_quarter founded_year first_funding_at last_funding_at    seed venture
## 1         2012-Q2         2012       2012-06-30      2012-06-30 1750000   0e+00
## 2            <NA>           NA       2010-06-04      2010-09-23       0   4e+06
## 3         2012-Q4         2012       2012-10-26      2012-08-09   40000   0e+00
## 4         2011-Q2         2011       2011-04-01      2011-04-01 1500000   0e+00
## 5         2014-Q1         2014       2014-08-17      2014-09-26       0   0e+00
## 6         2011-Q4         2011       2013-05-31      2013-05-31       0   7e+06
##   equity_crowdfunding undisclosed convertible_note debt_financing angel grant
## 1                   0           0                0              0     0     0
## 2                   0           0                0              0     0     0
## 3                   0           0                0              0     0     0
## 4                   0           0                0              0     0     0
## 5               60000           0                0              0     0     0
## 6                   0           0                0              0     0     0
##   private_equity post_ipo_equity post_ipo_debt secondary_market
## 1              0               0             0                0
## 2              0               0             0                0
## 3              0               0             0                0
## 4              0               0             0                0
## 5              0               0             0                0
## 6              0               0             0                0
##   product_crowdfunding round_A round_B round_C round_D round_E fund_date_issue
## 1                    0       0       0       0       0       0           FALSE
## 2                    0       0       0       0       0       0           FALSE
## 3                    0       0       0       0       0       0            TRUE
## 4                    0       0       0       0       0       0           FALSE
## 5                    0       0       0       0       0       0           FALSE
## 6                    0       0 7000000       0       0       0           FALSE
# Identify dates in the founded_at column and set their year values equal to the founded_year value

head(vc_data3 %>% arrange(desc(founded_at)))
##                                         name            category_list
## 1                                    Anacomp             |Consulting|
## 2                     Casey's General Stores             |E-Commerce|
## 3                             Gruppo Argenta |Recreation|Hospitality|
## 4          Healthcare Corporation of America          |Biotechnology|
## 5 Matthew Walker Comprehensive Health Center             |Nonprofits|
## 6                      Netsmart Technologies    |Enterprise Software|
##                  market funding_total_usd    status country_code state_code
## 1           Consulting         75,00,000   acquired          USA         CA
## 2           E-Commerce      56,90,00,000  operating          USA         IA
## 3           Recreation      12,86,60,000  operating         <NA>       <NA>
## 4        Biotechnology         91,25,000  operating          USA         TN
## 5           Nonprofits          1,73,210  operating          USA         TN
## 6  Enterprise Software         45,00,000  operating          USA         NY
##       region        city funding_rounds founded_at founded_month
## 1  San Diego   San Diego              1 2068-01-01       1968-01
## 2 Des Moines      Ankeny              1 2068-01-01       1968-01
## 3       <NA>        <NA>              1 2068-01-01       1968-01
## 4  Nashville   Nashville              2 2068-01-01       1968-01
## 5  Nashville   Nashville              1 2068-01-01       1968-01
## 6 NY - Other Great River              1 2068-01-01       1968-01
##   founded_quarter founded_year first_funding_at last_funding_at seed venture
## 1         1968-Q1         1968       2068-01-01      2009-06-08    0 7500000
## 2         1968-Q1         1968       2068-01-01      2010-08-23    0       0
## 3         1968-Q1         1968       2068-01-01      2014-01-21    0       0
## 4         1968-Q1         1968       2068-01-01      2014-06-25    0       0
## 5         1968-Q1         1968       2068-01-01      2014-02-12    0       0
## 6         1968-Q1         1968       2068-01-01      2005-09-19    0 4500000
##   equity_crowdfunding undisclosed convertible_note debt_financing angel  grant
## 1                   0           0                0       0.00e+00     0      0
## 2                   0           0                0       5.69e+08     0      0
## 3                   0           0                0       0.00e+00     0      0
## 4                   0           0                0       3.20e+06     0      0
## 5                   0           0                0       0.00e+00     0 173210
## 6                   0           0                0       0.00e+00     0      0
##   private_equity post_ipo_equity post_ipo_debt secondary_market
## 1              0               0             0                0
## 2              0               0             0                0
## 3      128660000               0             0                0
## 4        5925000               0             0                0
## 5              0               0             0                0
## 6              0               0             0                0
##   product_crowdfunding round_A round_B round_C round_D round_E fund_date_issue
## 1                    0       0       0       0       0       0            TRUE
## 2                    0       0       0       0       0       0            TRUE
## 3                    0       0       0       0       0       0            TRUE
## 4                    0       0       0       0       0       0            TRUE
## 5                    0       0       0       0       0       0            TRUE
## 6                    0       0       0       0       0       0            TRUE
for(i in 1:nrow(vc_data3)){
  if(!is.na(vc_data3[i,"founded_at"]) & vc_data3[i,"founded_at"] > 2014-12-13){
    year(vc_data3[i,"founded_at"]) = vc_data3[i,"founded_year"] 
    year(vc_data3[i,"first_funding_at"]) = vc_data3[i,"founded_year"]
    year(vc_data3[i,"last_funding_at"]) = vc_data3[i,"founded_year"]
  }
}

head(vc_data3)
##                 name
## 1           #waywire
## 2 &TV Communications
## 3  'Rock' Your Paper
## 4  (In)Touch Network
## 5 -R- Ranch and Mine
## 6      .Club Domains
##                                                                     category_list
## 1                                      |Entertainment|Politics|Social Media|News|
## 2                                                                         |Games|
## 3                                                          |Publishing|Education|
## 4 |Electronics|Guides|Coffee|Restaurants|Music|iPhone|Apps|Mobile|iOS|E-Commerce|
## 5                                                   |Tourism|Entertainment|Games|
## 6                                                                      |Software|
##          market funding_total_usd    status country_code state_code
## 1         News         17,50,000   acquired          USA         NY
## 2        Games         40,00,000  operating          USA         CA
## 3   Publishing             40,000 operating          EST       <NA>
## 4  Electronics         15,00,000  operating          GBR       <NA>
## 5      Tourism             60,000 operating          USA         TX
## 6     Software         70,00,000       <NA>          USA         FL
##           region         city funding_rounds founded_at founded_month
## 1  New York City     New York              1 2012-06-01       2012-06
## 2    Los Angeles  Los Angeles              2       <NA>          <NA>
## 3        Tallinn      Tallinn              1 2012-10-26       2012-10
## 4         London       London              1 2011-04-01       2011-04
## 5         Dallas   Fort Worth              2 2014-01-01       2014-01
## 6 Ft. Lauderdale Oakland Park              1 2011-10-10       2011-10
##   founded_quarter founded_year first_funding_at last_funding_at    seed venture
## 1         2012-Q2         2012       2012-06-30      2012-06-30 1750000   0e+00
## 2            <NA>           NA       2010-06-04      2010-09-23       0   4e+06
## 3         2012-Q4         2012       2012-10-26      2012-08-09   40000   0e+00
## 4         2011-Q2         2011       2011-04-01      2011-04-01 1500000   0e+00
## 5         2014-Q1         2014       2014-08-17      2014-09-26       0   0e+00
## 6         2011-Q4         2011       2011-05-31      2011-05-31       0   7e+06
##   equity_crowdfunding undisclosed convertible_note debt_financing angel grant
## 1                   0           0                0              0     0     0
## 2                   0           0                0              0     0     0
## 3                   0           0                0              0     0     0
## 4                   0           0                0              0     0     0
## 5               60000           0                0              0     0     0
## 6                   0           0                0              0     0     0
##   private_equity post_ipo_equity post_ipo_debt secondary_market
## 1              0               0             0                0
## 2              0               0             0                0
## 3              0               0             0                0
## 4              0               0             0                0
## 5              0               0             0                0
## 6              0               0             0                0
##   product_crowdfunding round_A round_B round_C round_D round_E fund_date_issue
## 1                    0       0       0       0       0       0           FALSE
## 2                    0       0       0       0       0       0           FALSE
## 3                    0       0       0       0       0       0            TRUE
## 4                    0       0       0       0       0       0           FALSE
## 5                    0       0       0       0       0       0           FALSE
## 6                    0       0 7000000       0       0       0           FALSE
tail(vc_data3)
##                                name
## 49433                    Zytoprotec
## 49434                         Zzish
## 49435 ZZNode Science and Technology
## 49436         Zzzzapp Wireless ltd.
## 49437                 [a]list games
## 49438                         [x+1]
##                                                       category_list
## 49433                                               |Biotechnology|
## 49434 |Analytics|Gamification|Developer APIs|iOS|Android|Education|
## 49435                                         |Enterprise Software|
## 49436                 |Web Development|Advertising|Wireless|Mobile|
## 49437                                                       |Games|
## 49438                                         |Enterprise Software|
##                      market funding_total_usd    status country_code state_code
## 49433        Biotechnology         26,86,600  operating          AUT       <NA>
## 49434            Education          3,20,000  operating          GBR       <NA>
## 49435  Enterprise Software         15,87,301  operating          CHN       <NA>
## 49436      Web Development             97,398 operating          HRV       <NA>
## 49437                Games         93,00,000  operating         <NA>       <NA>
## 49438  Enterprise Software       4,50,00,000  operating          USA         NY
##              region               city funding_rounds founded_at founded_month
## 49433        Vienna Gerasdorf Bei Wien              1 2007-01-01       2007-01
## 49434        London             London              1 2013-01-28       2013-01
## 49435       Beijing            Beijing              1       <NA>          <NA>
## 49436         Split              Split              5 2012-05-13       2012-05
## 49437          <NA>               <NA>              1       <NA>          <NA>
## 49438 New York City           New York              4 1999-01-01       1999-01
##       founded_quarter founded_year first_funding_at last_funding_at    seed
## 49433         2007-Q1         2007       2007-01-29      2007-01-29       0
## 49434         2013-Q1         2013       2013-03-24      2013-03-24  320000
## 49435            <NA>           NA       2012-04-01      2012-04-01       0
## 49436         2012-Q2         2012       2012-05-13      2012-09-10   71525
## 49437            <NA>           NA       2011-11-21      2011-11-21 9300000
## 49438         1999-Q1         1999       1999-06-01      1999-04-04       0
##        venture equity_crowdfunding undisclosed convertible_note debt_financing
## 49433  2686600                   0           0                0        0.0e+00
## 49434        0                   0           0                0        0.0e+00
## 49435  1587301                   0           0                0        0.0e+00
## 49436        0                   0           0            25873        0.0e+00
## 49437        0                   0           0                0        0.0e+00
## 49438 28000000                   0           0                0        1.7e+07
##       angel grant private_equity post_ipo_equity post_ipo_debt secondary_market
## 49433     0     0              0               0             0                0
## 49434     0     0              0               0             0                0
## 49435     0     0              0               0             0                0
## 49436     0     0              0               0             0                0
## 49437     0     0              0               0             0                0
## 49438     0     0              0               0             0                0
##       product_crowdfunding  round_A  round_B round_C round_D round_E
## 49433                    0  2686600        0       0       0       0
## 49434                    0        0        0       0       0       0
## 49435                    0  1587301        0       0       0       0
## 49436                    0        0        0       0       0       0
## 49437                    0        0        0       0       0       0
## 49438                    0 16000000 10000000       0       0       0
##       fund_date_issue
## 49433           FALSE
## 49434           FALSE
## 49435           FALSE
## 49436            TRUE
## 49437           FALSE
## 49438           FALSE

Remove commas from total funding column and extract month and quarter from founded_month and founded_quarter columns

vc_data3$funding_total_usd<- as.numeric(gsub(",","",vc_data3$funding_total_usd))
## Warning: NAs introduced by coercion
vc_data4 <- vc_data3 %>% mutate(founded_month = as.integer(substr(founded_month,6,7)), founded_quarter = as.factor(substr(founded_quarter,6,7)))
head(vc_data4)
##                 name
## 1           #waywire
## 2 &TV Communications
## 3  'Rock' Your Paper
## 4  (In)Touch Network
## 5 -R- Ranch and Mine
## 6      .Club Domains
##                                                                     category_list
## 1                                      |Entertainment|Politics|Social Media|News|
## 2                                                                         |Games|
## 3                                                          |Publishing|Education|
## 4 |Electronics|Guides|Coffee|Restaurants|Music|iPhone|Apps|Mobile|iOS|E-Commerce|
## 5                                                   |Tourism|Entertainment|Games|
## 6                                                                      |Software|
##          market funding_total_usd    status country_code state_code
## 1         News            1750000  acquired          USA         NY
## 2        Games            4000000 operating          USA         CA
## 3   Publishing              40000 operating          EST       <NA>
## 4  Electronics            1500000 operating          GBR       <NA>
## 5      Tourism              60000 operating          USA         TX
## 6     Software            7000000      <NA>          USA         FL
##           region         city funding_rounds founded_at founded_month
## 1  New York City     New York              1 2012-06-01             6
## 2    Los Angeles  Los Angeles              2       <NA>            NA
## 3        Tallinn      Tallinn              1 2012-10-26            10
## 4         London       London              1 2011-04-01             4
## 5         Dallas   Fort Worth              2 2014-01-01             1
## 6 Ft. Lauderdale Oakland Park              1 2011-10-10            10
##   founded_quarter founded_year first_funding_at last_funding_at    seed venture
## 1              Q2         2012       2012-06-30      2012-06-30 1750000   0e+00
## 2            <NA>           NA       2010-06-04      2010-09-23       0   4e+06
## 3              Q4         2012       2012-10-26      2012-08-09   40000   0e+00
## 4              Q2         2011       2011-04-01      2011-04-01 1500000   0e+00
## 5              Q1         2014       2014-08-17      2014-09-26       0   0e+00
## 6              Q4         2011       2011-05-31      2011-05-31       0   7e+06
##   equity_crowdfunding undisclosed convertible_note debt_financing angel grant
## 1                   0           0                0              0     0     0
## 2                   0           0                0              0     0     0
## 3                   0           0                0              0     0     0
## 4                   0           0                0              0     0     0
## 5               60000           0                0              0     0     0
## 6                   0           0                0              0     0     0
##   private_equity post_ipo_equity post_ipo_debt secondary_market
## 1              0               0             0                0
## 2              0               0             0                0
## 3              0               0             0                0
## 4              0               0             0                0
## 5              0               0             0                0
## 6              0               0             0                0
##   product_crowdfunding round_A round_B round_C round_D round_E fund_date_issue
## 1                    0       0       0       0       0       0           FALSE
## 2                    0       0       0       0       0       0           FALSE
## 3                    0       0       0       0       0       0            TRUE
## 4                    0       0       0       0       0       0           FALSE
## 5                    0       0       0       0       0       0           FALSE
## 6                    0       0 7000000       0       0       0           FALSE

Change founded_month integer variable to a factor with levels equal to the names of months

month <- vector(mode = "character", length = nrow(vc_data4))

for(i in 1:nrow(vc_data4)){
  if(is.na(vc_data4[i,"founded_month"])){
    month[i] = NA
  }else if(vc_data4[i,"founded_month"] == 1){
    month[i] = "January"
  } else if(vc_data4[i,"founded_month"] == 2){
    month[i] = "February"
  } else if(vc_data4[i,"founded_month"] == 3){
    month[i] = "March"
  }else if(vc_data4[i,"founded_month"] == 4){
    month[i] = "April"
  }else if(vc_data4[i,"founded_month"] == 5){
    month[i] = "May"
  }else if(vc_data4[i,"founded_month"] == 6){
    month[i] = "June"
  }else if(vc_data4[i,"founded_month"] == 7){
    month[i] = "July"
  }else if(vc_data4[i,"founded_month"] == 8){
    month[i] = "August"
  }else if(vc_data4[i,"founded_month"] == 9){
    month[i] = "September"
  }else if(vc_data4[i,"founded_month"] == 10){
    month[i] = "October"
  }else if(vc_data4[i,"founded_month"] == 11){
    month[i] = "November"
  }else if(vc_data4[i,"founded_month"] == 12){
    month[i] = "December"
  }
}

for(i in 1:nrow(vc_data4)){
  vc_data4[i,12] = month[i]
}

head(vc_data4)
##                 name
## 1           #waywire
## 2 &TV Communications
## 3  'Rock' Your Paper
## 4  (In)Touch Network
## 5 -R- Ranch and Mine
## 6      .Club Domains
##                                                                     category_list
## 1                                      |Entertainment|Politics|Social Media|News|
## 2                                                                         |Games|
## 3                                                          |Publishing|Education|
## 4 |Electronics|Guides|Coffee|Restaurants|Music|iPhone|Apps|Mobile|iOS|E-Commerce|
## 5                                                   |Tourism|Entertainment|Games|
## 6                                                                      |Software|
##          market funding_total_usd    status country_code state_code
## 1         News            1750000  acquired          USA         NY
## 2        Games            4000000 operating          USA         CA
## 3   Publishing              40000 operating          EST       <NA>
## 4  Electronics            1500000 operating          GBR       <NA>
## 5      Tourism              60000 operating          USA         TX
## 6     Software            7000000      <NA>          USA         FL
##           region         city funding_rounds founded_at founded_month
## 1  New York City     New York              1 2012-06-01          June
## 2    Los Angeles  Los Angeles              2       <NA>          <NA>
## 3        Tallinn      Tallinn              1 2012-10-26       October
## 4         London       London              1 2011-04-01         April
## 5         Dallas   Fort Worth              2 2014-01-01       January
## 6 Ft. Lauderdale Oakland Park              1 2011-10-10       October
##   founded_quarter founded_year first_funding_at last_funding_at    seed venture
## 1              Q2         2012       2012-06-30      2012-06-30 1750000   0e+00
## 2            <NA>           NA       2010-06-04      2010-09-23       0   4e+06
## 3              Q4         2012       2012-10-26      2012-08-09   40000   0e+00
## 4              Q2         2011       2011-04-01      2011-04-01 1500000   0e+00
## 5              Q1         2014       2014-08-17      2014-09-26       0   0e+00
## 6              Q4         2011       2011-05-31      2011-05-31       0   7e+06
##   equity_crowdfunding undisclosed convertible_note debt_financing angel grant
## 1                   0           0                0              0     0     0
## 2                   0           0                0              0     0     0
## 3                   0           0                0              0     0     0
## 4                   0           0                0              0     0     0
## 5               60000           0                0              0     0     0
## 6                   0           0                0              0     0     0
##   private_equity post_ipo_equity post_ipo_debt secondary_market
## 1              0               0             0                0
## 2              0               0             0                0
## 3              0               0             0                0
## 4              0               0             0                0
## 5              0               0             0                0
## 6              0               0             0                0
##   product_crowdfunding round_A round_B round_C round_D round_E fund_date_issue
## 1                    0       0       0       0       0       0           FALSE
## 2                    0       0       0       0       0       0           FALSE
## 3                    0       0       0       0       0       0            TRUE
## 4                    0       0       0       0       0       0           FALSE
## 5                    0       0       0       0       0       0           FALSE
## 6                    0       0 7000000       0       0       0           FALSE
vc_data5<- vc_data4 %>% mutate(founded_month = as.factor(founded_month))
head(vc_data5)
##                 name
## 1           #waywire
## 2 &TV Communications
## 3  'Rock' Your Paper
## 4  (In)Touch Network
## 5 -R- Ranch and Mine
## 6      .Club Domains
##                                                                     category_list
## 1                                      |Entertainment|Politics|Social Media|News|
## 2                                                                         |Games|
## 3                                                          |Publishing|Education|
## 4 |Electronics|Guides|Coffee|Restaurants|Music|iPhone|Apps|Mobile|iOS|E-Commerce|
## 5                                                   |Tourism|Entertainment|Games|
## 6                                                                      |Software|
##          market funding_total_usd    status country_code state_code
## 1         News            1750000  acquired          USA         NY
## 2        Games            4000000 operating          USA         CA
## 3   Publishing              40000 operating          EST       <NA>
## 4  Electronics            1500000 operating          GBR       <NA>
## 5      Tourism              60000 operating          USA         TX
## 6     Software            7000000      <NA>          USA         FL
##           region         city funding_rounds founded_at founded_month
## 1  New York City     New York              1 2012-06-01          June
## 2    Los Angeles  Los Angeles              2       <NA>          <NA>
## 3        Tallinn      Tallinn              1 2012-10-26       October
## 4         London       London              1 2011-04-01         April
## 5         Dallas   Fort Worth              2 2014-01-01       January
## 6 Ft. Lauderdale Oakland Park              1 2011-10-10       October
##   founded_quarter founded_year first_funding_at last_funding_at    seed venture
## 1              Q2         2012       2012-06-30      2012-06-30 1750000   0e+00
## 2            <NA>           NA       2010-06-04      2010-09-23       0   4e+06
## 3              Q4         2012       2012-10-26      2012-08-09   40000   0e+00
## 4              Q2         2011       2011-04-01      2011-04-01 1500000   0e+00
## 5              Q1         2014       2014-08-17      2014-09-26       0   0e+00
## 6              Q4         2011       2011-05-31      2011-05-31       0   7e+06
##   equity_crowdfunding undisclosed convertible_note debt_financing angel grant
## 1                   0           0                0              0     0     0
## 2                   0           0                0              0     0     0
## 3                   0           0                0              0     0     0
## 4                   0           0                0              0     0     0
## 5               60000           0                0              0     0     0
## 6                   0           0                0              0     0     0
##   private_equity post_ipo_equity post_ipo_debt secondary_market
## 1              0               0             0                0
## 2              0               0             0                0
## 3              0               0             0                0
## 4              0               0             0                0
## 5              0               0             0                0
## 6              0               0             0                0
##   product_crowdfunding round_A round_B round_C round_D round_E fund_date_issue
## 1                    0       0       0       0       0       0           FALSE
## 2                    0       0       0       0       0       0           FALSE
## 3                    0       0       0       0       0       0            TRUE
## 4                    0       0       0       0       0       0           FALSE
## 5                    0       0       0       0       0       0           FALSE
## 6                    0       0 7000000       0       0       0           FALSE

Geographical Data with Cleaned VC Data

c_code_df <- subset(c_code_df, select = -c(Country.1))
c_code_df <- c_code_df %>% rename(country = Country, country_code = Country.Code)

c_data_df <- c_data_df %>% rename(country = Country, region = Region, population = Population, area_sq_mi = Area..sq..mi.., pop_dens_per_mi = Pop..Density..per.sq..mi.., gdp_per_capita = GDP....per.capita., literacy = Literacy...., phones_per_1000 = Phones..per.1000.)

geographical_df1 <- merge(c_data_df,c_code_df)

geographical_df <- merge.data.frame(vc_data_clean, geographical_df1, by.x="country_code", by.y ="country_code")
geographical_df <- geographical_df %>% rename(global_region = region.y, region = region.x)

Create categorical variable for numerical venture variable

vc_data_clean2 <- vc_data_clean %>% filter(fund_date_issue != TRUE, !is.na(venture), !is.na(first_funding_at), venture > 0)

vc_fund_cats1 <- vc_data_clean2$venture

range(vc_data_clean2$venture)
## [1] 2.200e+01 2.351e+09
vc_data_clean2$venture_cats <- cut(vc_fund_cats1, 20, labels = c("0 to 117 million", "117 to 235 million","235 to 352 million","352 to 470 million","470 to 588 million","588 to 705 million","705 to 823 million","823 to 940 million","940 million to 1 billion","1 to 1.175 billion","1.17 to 1.3 billion","1.3 to 1.4 billion","1.4 to 1.53 billion","1.53 to 1.64 billion","1.64 to 1.76 billion", "1.76 to 1.88 billion", "1.88 to 1.99 billion","1.99 to 2.11 billion","2.11 to 2.23 billion","2.23 to 2.35 billion"))

head(vc_data_clean2)
##                 name                  category_list        market
## 1 &TV Communications                        |Games|        Games 
## 2      .Club Domains                     |Software|     Software 
## 3            0-6.com                  |Curated Web|  Curated Web 
## 4    10 Minutes With                    |Education|    Education 
## 5    1000museums.com                  |Curated Web|  Curated Web 
## 6         1001 Menus |Local Businesses|Restaurants|  Restaurants 
##   funding_total_usd    status country_code state_code         region
## 1           4000000 operating          USA         CA    Los Angeles
## 2           7000000      <NA>          USA         FL Ft. Lauderdale
## 3           2000000 operating         <NA>       <NA>           <NA>
## 4           4400000 operating          GBR       <NA>         London
## 5           4962651 operating          USA         WA        Seattle
## 6           4059079 operating          FRA       <NA>          Paris
##           city funding_rounds founded_at founded_month founded_quarter
## 1  Los Angeles              2       <NA>          <NA>            <NA>
## 2 Oakland Park              1 2011-10-10       October              Q4
## 3         <NA>              1 2007-01-01       January              Q1
## 4       London              2 2013-01-01       January              Q1
## 5     Bellevue              6 2008-01-01       January              Q1
## 6        Paris              4 2010-11-20      November              Q4
##   founded_year first_funding_at last_funding_at   seed venture
## 1           NA       2010-06-04      2010-09-23      0 4000000
## 2         2011       2011-05-31      2011-05-31      0 7000000
## 3         2007       2007-03-19      2007-03-19      0 2000000
## 4         2013       2013-01-01      2013-10-09 400000 4000000
## 5         2008       2008-10-14      2008-09-19      0 3814772
## 6         2010       2010-12-15      2010-11-13 522169 3536910
##   equity_crowdfunding undisclosed convertible_note debt_financing angel grant
## 1                   0           0                0              0     0     0
## 2                   0           0                0              0     0     0
## 3                   0           0                0              0     0     0
## 4                   0           0                0              0     0     0
## 5                   0           0                0        1147879     0     0
## 6                   0           0                0              0     0     0
##   private_equity post_ipo_equity post_ipo_debt secondary_market
## 1              0               0             0                0
## 2              0               0             0                0
## 3              0               0             0                0
## 4              0               0             0                0
## 5              0               0             0                0
## 6              0               0             0                0
##   product_crowdfunding round_A round_B round_C round_D round_E fund_date_issue
## 1                    0       0       0       0       0       0           FALSE
## 2                    0       0 7000000       0       0       0           FALSE
## 3                    0 2000000       0       0       0       0           FALSE
## 4                    0 4000000       0       0       0       0           FALSE
## 5                    0       0       0       0       0       0           FALSE
## 6                    0 3536910       0       0       0       0           FALSE
##   markets3     venture_cats
## 1    Other 0 to 117 million
## 2    Other 0 to 117 million
## 3 Internet 0 to 117 million
## 4    Other 0 to 117 million
## 5 Internet 0 to 117 million
## 6    Other 0 to 117 million

Take a sample of 500 observations of venture funding and adjust the nominal venture funding for inflation such that funding from any time period is expressed in 2014 prices

real_venture<-vector(mode = "numeric", length = nrow(vc_data_clean2))
vc_data_clean2 <- cbind(vc_data_clean2,real_venture)
head(vc_data_clean2)
##                 name                  category_list        market
## 1 &TV Communications                        |Games|        Games 
## 2      .Club Domains                     |Software|     Software 
## 3            0-6.com                  |Curated Web|  Curated Web 
## 4    10 Minutes With                    |Education|    Education 
## 5    1000museums.com                  |Curated Web|  Curated Web 
## 6         1001 Menus |Local Businesses|Restaurants|  Restaurants 
##   funding_total_usd    status country_code state_code         region
## 1           4000000 operating          USA         CA    Los Angeles
## 2           7000000      <NA>          USA         FL Ft. Lauderdale
## 3           2000000 operating         <NA>       <NA>           <NA>
## 4           4400000 operating          GBR       <NA>         London
## 5           4962651 operating          USA         WA        Seattle
## 6           4059079 operating          FRA       <NA>          Paris
##           city funding_rounds founded_at founded_month founded_quarter
## 1  Los Angeles              2       <NA>          <NA>            <NA>
## 2 Oakland Park              1 2011-10-10       October              Q4
## 3         <NA>              1 2007-01-01       January              Q1
## 4       London              2 2013-01-01       January              Q1
## 5     Bellevue              6 2008-01-01       January              Q1
## 6        Paris              4 2010-11-20      November              Q4
##   founded_year first_funding_at last_funding_at   seed venture
## 1           NA       2010-06-04      2010-09-23      0 4000000
## 2         2011       2011-05-31      2011-05-31      0 7000000
## 3         2007       2007-03-19      2007-03-19      0 2000000
## 4         2013       2013-01-01      2013-10-09 400000 4000000
## 5         2008       2008-10-14      2008-09-19      0 3814772
## 6         2010       2010-12-15      2010-11-13 522169 3536910
##   equity_crowdfunding undisclosed convertible_note debt_financing angel grant
## 1                   0           0                0              0     0     0
## 2                   0           0                0              0     0     0
## 3                   0           0                0              0     0     0
## 4                   0           0                0              0     0     0
## 5                   0           0                0        1147879     0     0
## 6                   0           0                0              0     0     0
##   private_equity post_ipo_equity post_ipo_debt secondary_market
## 1              0               0             0                0
## 2              0               0             0                0
## 3              0               0             0                0
## 4              0               0             0                0
## 5              0               0             0                0
## 6              0               0             0                0
##   product_crowdfunding round_A round_B round_C round_D round_E fund_date_issue
## 1                    0       0       0       0       0       0           FALSE
## 2                    0       0 7000000       0       0       0           FALSE
## 3                    0 2000000       0       0       0       0           FALSE
## 4                    0 4000000       0       0       0       0           FALSE
## 5                    0       0       0       0       0       0           FALSE
## 6                    0 3536910       0       0       0       0           FALSE
##   markets3     venture_cats real_venture
## 1    Other 0 to 117 million            0
## 2    Other 0 to 117 million            0
## 3 Internet 0 to 117 million            0
## 4    Other 0 to 117 million            0
## 5 Internet 0 to 117 million            0
## 6    Other 0 to 117 million            0
set.seed(123)
vc_data_clean3 <- sample_n(vc_data_clean2,500)

for(i in 1:nrow(data)){
  for(j in 1:nrow(vc_data_clean3)){
if(!is.na(vc_data_clean3[j,"first_funding_at"])  & !is.na(vc_data_clean3[j,"venture"]) & difftime(vc_data_clean3[j,"first_funding_at"], data[i,"Date"], units = "days") >= 0 & difftime(vc_data_clean3[j,"first_funding_at"], data[i,"Date"], units = "days") < 31){
    vc_data_clean3[j,"real_venture"] = vc_data_clean3[j,"venture"]*(data[1213,"Index"]/data[i,"Index"])
}
}
}

head(vc_data_clean3)
##                             name                         category_list
## 1                        Sunbeam                       |Biotechnology|
## 2          SuperOx Wastewater Co                       |Manufacturing|
## 3 Brain Tunnelgenix Technologies                       |Biotechnology|
## 4                    Audioscribe                                  <NA>
## 5                    Canopy Labs           |Lead Generation|Analytics|
## 6                          Manta |Professional Networking|Curated Web|
##              market funding_total_usd    status country_code state_code
## 1    Biotechnology            9329636 operating          USA         FL
## 2    Manufacturing            1700000 operating          USA         TX
## 3    Biotechnology            2563168 operating          USA         CT
## 4              <NA>           1499800 operating         <NA>       <NA>
## 5  Lead Generation            2064000      <NA>          USA         CA
## 6      Curated Web           47215715 operating          USA         OH
##           region            city funding_rounds founded_at founded_month
## 1 Ft. Lauderdale Fort Lauderdale              1 2007-01-01       January
## 2        Houston         Houston              1 2010-01-01       January
## 3       Hartford      Bridgeport              3 2006-01-01       January
## 4           <NA>            <NA>              2       <NA>          <NA>
## 5    SF Bay Area   San Francisco              2 2012-01-01       January
## 6 Columbus, Ohio        Columbus              2 2005-09-01     September
##   founded_quarter founded_year first_funding_at last_funding_at    seed
## 1              Q1         2007       2007-12-11      2007-12-11       0
## 2              Q1         2010       2010-05-24      2010-05-24       0
## 3              Q1         2006       2006-04-16      2006-09-17  100000
## 4            <NA>           NA       2012-07-01      2012-10-01       0
## 5              Q1         2012       2012-12-13      2012-03-21 1500000
## 6              Q3         2005       2005-01-04      2005-04-02       0
##    venture equity_crowdfunding undisclosed convertible_note debt_financing
## 1  9329636                   0           0                0              0
## 2  1700000                   0           0                0              0
## 3  2213168                   0           0                0              0
## 4  1499800                   0           0                0              0
## 5   564000                   0           0                0              0
## 6 47215715                   0           0                0              0
##   angel grant private_equity post_ipo_equity post_ipo_debt secondary_market
## 1     0     0              0               0             0                0
## 2     0     0              0               0             0                0
## 3     0     0              0               0             0                0
## 4     0     0              0               0             0                0
## 5     0     0              0               0             0                0
## 6     0     0              0               0             0                0
##   product_crowdfunding round_A round_B round_C round_D round_E fund_date_issue
## 1                    0       0       0       0       0       0           FALSE
## 2                    0       0       0       0       0       0           FALSE
## 3               250000       0       0       0       0       0           FALSE
## 4                    0       0       0       0       0       0           FALSE
## 5                    0       0       0       0       0       0           FALSE
## 6                    0       0       0       0       0       0           FALSE
##                  markets3     venture_cats real_venture
## 1 Health/Medicine/Biotech 0 to 117 million   10390367.1
## 2                   Other 0 to 117 million    1822627.4
## 3 Health/Medicine/Biotech 0 to 117 million    2569208.0
## 4                    <NA> 0 to 117 million    1528805.4
## 5                   Other 0 to 117 million     574599.5
## 6                Internet 0 to 117 million   57915632.9
vc_fund_cats1 <- vc_data_clean3$real_venture

range(vc_data_clean3$real_venture)
## [1]         0 319412108
vc_data_clean3$venture_cats <- cut(vc_fund_cats1, 10, labels = c("0 to 32 million", "32 to 64 million","64 to 96 million","96 to 128 million","128 to 160 million","160 to 192 million","192 to 224 million","224 to 256 million","256 to 288 million","288 to 320 million"))

head(vc_data_clean3)
##                             name                         category_list
## 1                        Sunbeam                       |Biotechnology|
## 2          SuperOx Wastewater Co                       |Manufacturing|
## 3 Brain Tunnelgenix Technologies                       |Biotechnology|
## 4                    Audioscribe                                  <NA>
## 5                    Canopy Labs           |Lead Generation|Analytics|
## 6                          Manta |Professional Networking|Curated Web|
##              market funding_total_usd    status country_code state_code
## 1    Biotechnology            9329636 operating          USA         FL
## 2    Manufacturing            1700000 operating          USA         TX
## 3    Biotechnology            2563168 operating          USA         CT
## 4              <NA>           1499800 operating         <NA>       <NA>
## 5  Lead Generation            2064000      <NA>          USA         CA
## 6      Curated Web           47215715 operating          USA         OH
##           region            city funding_rounds founded_at founded_month
## 1 Ft. Lauderdale Fort Lauderdale              1 2007-01-01       January
## 2        Houston         Houston              1 2010-01-01       January
## 3       Hartford      Bridgeport              3 2006-01-01       January
## 4           <NA>            <NA>              2       <NA>          <NA>
## 5    SF Bay Area   San Francisco              2 2012-01-01       January
## 6 Columbus, Ohio        Columbus              2 2005-09-01     September
##   founded_quarter founded_year first_funding_at last_funding_at    seed
## 1              Q1         2007       2007-12-11      2007-12-11       0
## 2              Q1         2010       2010-05-24      2010-05-24       0
## 3              Q1         2006       2006-04-16      2006-09-17  100000
## 4            <NA>           NA       2012-07-01      2012-10-01       0
## 5              Q1         2012       2012-12-13      2012-03-21 1500000
## 6              Q3         2005       2005-01-04      2005-04-02       0
##    venture equity_crowdfunding undisclosed convertible_note debt_financing
## 1  9329636                   0           0                0              0
## 2  1700000                   0           0                0              0
## 3  2213168                   0           0                0              0
## 4  1499800                   0           0                0              0
## 5   564000                   0           0                0              0
## 6 47215715                   0           0                0              0
##   angel grant private_equity post_ipo_equity post_ipo_debt secondary_market
## 1     0     0              0               0             0                0
## 2     0     0              0               0             0                0
## 3     0     0              0               0             0                0
## 4     0     0              0               0             0                0
## 5     0     0              0               0             0                0
## 6     0     0              0               0             0                0
##   product_crowdfunding round_A round_B round_C round_D round_E fund_date_issue
## 1                    0       0       0       0       0       0           FALSE
## 2                    0       0       0       0       0       0           FALSE
## 3               250000       0       0       0       0       0           FALSE
## 4                    0       0       0       0       0       0           FALSE
## 5                    0       0       0       0       0       0           FALSE
## 6                    0       0       0       0       0       0           FALSE
##                  markets3     venture_cats real_venture
## 1 Health/Medicine/Biotech  0 to 32 million   10390367.1
## 2                   Other  0 to 32 million    1822627.4
## 3 Health/Medicine/Biotech  0 to 32 million    2569208.0
## 4                    <NA>  0 to 32 million    1528805.4
## 5                   Other  0 to 32 million     574599.5
## 6                Internet 32 to 64 million   57915632.9

Data Analysis

Proportion of Companies by Funding Amount

# Is the proportion of companies by funding amount equal?

fund_prop <- vc_data_clean2 %>% group_by(venture_cats) %>% summarise(num_comps = length(status)) %>% mutate(prop_comps = num_comps/sum(num_comps))
fund_prop
## # A tibble: 10 x 3
##    venture_cats         num_comps prop_comps
##    <fct>                    <int>      <dbl>
##  1 0 to 117 million         22154  0.986    
##  2 117 to 235 million         245  0.0109   
##  3 235 to 352 million          38  0.00169  
##  4 352 to 470 million          11  0.000490 
##  5 470 to 588 million           5  0.000223 
##  6 588 to 705 million           6  0.000267 
##  7 705 to 823 million           5  0.000223 
##  8 1.17 to 1.3 billion          1  0.0000445
##  9 1.4 to 1.53 billion          1  0.0000445
## 10 2.23 to 2.35 billion         1  0.0000445
# Is the mean number of funding rounds the same across all funding amounts?

fund_rnd_funding <- vc_data_clean2 %>% group_by(venture_cats) %>% filter(!is.na(funding_rounds)) %>% summarise(mean_rnds = mean(funding_rounds), sum_rnds = sum(funding_rounds)) %>% mutate(prop_rnds = sum_rnds/sum(sum_rnds))
fund_rnd_funding
## # A tibble: 10 x 4
##    venture_cats         mean_rnds sum_rnds prop_rnds
##    <fct>                    <dbl>    <int>     <dbl>
##  1 0 to 117 million          2.10    46446  0.966   
##  2 117 to 235 million        5.16     1265  0.0263  
##  3 235 to 352 million        5.05      192  0.00399 
##  4 352 to 470 million        4.45       49  0.00102 
##  5 470 to 588 million        8          40  0.000832
##  6 588 to 705 million        5.67       34  0.000707
##  7 705 to 823 million        7.4        37  0.000769
##  8 1.17 to 1.3 billion       8           8  0.000166
##  9 1.4 to 1.53 billion       6           6  0.000125
## 10 2.23 to 2.35 billion     11          11  0.000229
ggplot(fund_rnd_funding) + geom_col(mapping = aes(x = venture_cats, y = mean_rnds)) + scale_x_discrete("Funding Amount") + scale_y_continuous("Mean Funding Rounds") + ggtitle("Mean Funding Rounds by Funding Amount") + theme(plot.title = element_text(hjust = 0.5)) + coord_flip()

Mean Venture Funding and Funding Rounds by Status from 1913 to 1990 and from 1990 to 2014

# Does mean real venture funding (venture funding adjusted for inflation) by status vary from the 1980's onward?

venture_status1 <- vc_data_clean3 %>% group_by(status) %>% filter(!is.na(real_venture), founded_year < 1990-01-01) %>% summarise(mean_venture = mean(real_venture), sum_venture = sum(real_venture)) %>% mutate(prop_venture = sum_venture/sum(sum_venture))
## Warning: Factor `status` contains implicit NA, consider using
## `forcats::fct_explicit_na`
venture_status1
## # A tibble: 3 x 4
##   status    mean_venture sum_venture prop_venture
##   <fct>            <dbl>       <dbl>        <dbl>
## 1 acquired     43917321.   87834642.        0.724
## 2 closed       13678213.   13678213.        0.113
## 3 operating     3970226.   19851130.        0.164
ggplot(venture_status1) + geom_col(mapping = aes(x = status, y = mean_venture)) +scale_x_discrete("Status")+ scale_y_continuous("Mean Funding Rounds") + ggtitle("Mean Funding Rounds by Funding Amount") + theme(plot.title = element_text(hjust = 0.5)) + coord_flip()

venture_status2 <- vc_data_clean3 %>% group_by(status) %>% filter(!is.na(real_venture), founded_year > 1990-01-01) %>% summarise(mean_venture = mean(real_venture), sum_venture = sum(as.numeric(real_venture))) %>% mutate(prop_venture = sum_venture/sum(sum_venture))
## Warning: Factor `status` contains implicit NA, consider using
## `forcats::fct_explicit_na`
venture_status2
## # A tibble: 4 x 4
##   status    mean_venture sum_venture prop_venture
##   <fct>            <dbl>       <dbl>        <dbl>
## 1 acquired     18859124.  697787587.       0.0955
## 2 closed        5112920.   76693795.       0.0105
## 3 operating    20876276. 6367264069.       0.872 
## 4 <NA>         13578712.  162944541.       0.0223
ggplot(venture_status2) + geom_col(mapping = aes(x = status, y = mean_venture)) + scale_x_discrete("Status") + scale_y_continuous("Mean Real Venture Funding") + ggtitle("Mean Real Venture Funding by Funding Amount") + theme(plot.title = element_text(hjust = 0.5)) + coord_flip()

# Does the mean number of funding rounds vary from the 1980's onward?

fund_status_rnd_time1 <- vc_data_clean %>% group_by(status) %>% filter(!is.na(funding_rounds), founded_at < 1990-01-01) %>%  summarise(mean_fund_rnd = mean(funding_rounds), sum_funding_rnds = sum(funding_rounds)) %>% mutate(prop_funding = sum_funding_rnds/sum(sum_funding_rnds))
## Warning: Factor `status` contains implicit NA, consider using
## `forcats::fct_explicit_na`
fund_status_rnd_time1
## # A tibble: 4 x 4
##   status    mean_fund_rnd sum_funding_rnds prop_funding
##   <fct>             <dbl>            <int>        <dbl>
## 1 acquired           1.16               22       0.0603
## 2 closed             1                   7       0.0192
## 3 operating          1.31              319       0.874 
## 4 <NA>               1                  17       0.0466
ggplot(fund_status_rnd_time1) + geom_col(mapping = aes(x = status, y = mean_fund_rnd)) + scale_x_discrete("Status") + scale_y_continuous("Mean Funding Rounds") + ggtitle("Mean Funding Rounds by Status") + theme(plot.title = element_text(hjust = 0.5)) + coord_flip()

fund_status_rnd_time2 <- vc_data_clean %>% group_by(status) %>% filter(!is.na(funding_rounds), founded_at > 1990-01-01) %>%  summarise(mean_fund_rnd = mean(funding_rounds), sum_funding_rnds = sum(funding_rounds)) %>% mutate(prop_funding = sum_funding_rnds/sum(sum_funding_rnds))
## Warning: Factor `status` contains implicit NA, consider using
## `forcats::fct_explicit_na`
fund_status_rnd_time2
## # A tibble: 4 x 4
##   status    mean_fund_rnd sum_funding_rnds prop_funding
##   <fct>             <dbl>            <int>        <dbl>
## 1 acquired           2.12             6268       0.0913
## 2 closed             1.45             2885       0.0420
## 3 operating          1.79            58008       0.845 
## 4 <NA>               1.66             1497       0.0218
ggplot(fund_status_rnd_time2) + geom_col(mapping = aes(x = status, y = mean_fund_rnd)) + scale_x_discrete("Status") + scale_y_continuous("Mean Funding Rounds") + ggtitle("Mean Funding Rounds by Status") + theme(plot.title = element_text(hjust = 0.5)) + coord_flip()

Mean Venture Funding and Funding Rounds by Market from 1913 to 1990 and from 1990 to 2014

# Does mean real venture funding (venture funding adjusted for inflation) by market vary from the 1980's onward?

real_venture_market_time1 <-vc_data_clean3 %>%  group_by(markets3) %>% filter(!is.na(real_venture), founded_year < 1990) %>%  summarise(mean_funding = mean(real_venture), sum_funding = sum(real_venture)) %>% mutate(prop_funding = sum_funding/sum(sum_funding)) %>% arrange(desc(mean_funding))
## Warning: Factor `markets3` contains implicit NA, consider using
## `forcats::fct_explicit_na`
real_venture_market_time1
## # A tibble: 4 x 4
##   markets3                    mean_funding sum_funding prop_funding
##   <fct>                              <dbl>       <dbl>        <dbl>
## 1 Big Data Analytics/Security    18623885.   18623885.      0.133  
## 2 Other                          17798893.  106793360.      0.763  
## 3 Health/Medicine/Biotech        13678213.   13678213.      0.0977 
## 4 Energy                           892412.     892412.      0.00637
ggplot(real_venture_market_time1) + geom_col(mapping = aes(x = markets3, y = mean_funding)) + scale_x_discrete("Markets") + scale_y_continuous("Mean Real Venture Funding") + ggtitle("Mean Real Venture Funding by Market") + theme(plot.title = element_text(hjust = 0.5)) + coord_flip()

real_venture_market_time2 <-vc_data_clean3 %>%  group_by(markets3) %>% filter(!is.na(real_venture), founded_year > 1990) %>%  summarise(mean_funding = mean(real_venture), sum_funding = sum(real_venture)) %>% mutate(prop_funding = sum_funding/sum(sum_funding)) %>% arrange(desc(mean_funding))
## Warning: Factor `markets3` contains implicit NA, consider using
## `forcats::fct_explicit_na`
real_venture_market_time2
## # A tibble: 8 x 4
##   markets3                    mean_funding sum_funding prop_funding
##   <fct>                              <dbl>       <dbl>        <dbl>
## 1 Finance                        31353700.  313536996.      0.0440 
## 2 Energy                         30286308.  424008306.      0.0596 
## 3 Health/Medicine/Biotech        28264322. 1582802055.      0.222  
## 4 Internet                       19775608. 1285414501.      0.181  
## 5 Big Data Analytics/Security    19613363.  274587084.      0.0386 
## 6 Other                          16866863. 2951700952.      0.415  
## 7 Apps/Social Media              10937640.  262503366.      0.0369 
## 8 <NA>                            3127957.   25023657.      0.00351
ggplot(real_venture_market_time2) + geom_col(mapping = aes(x = markets3, y = mean_funding)) + scale_x_discrete("Markets") + scale_y_continuous("Mean Real Venture Funding") + ggtitle("Mean Real Venture Funding by Market") + theme(plot.title = element_text(hjust = 0.5)) + coord_flip()

# Do the markets that recieve the most funding rounds on average vary over time?

fund_rnd_market_time1 <-vc_data_clean %>%  group_by(markets3) %>% filter(!is.na(funding_rounds), founded_at < 1990-01-01) %>%  summarise(mean_fund_rnd = mean(funding_rounds), sum_fund_rnd = sum(funding_rounds)) %>% mutate(prop_funding = sum_fund_rnd/sum(sum_fund_rnd)) %>% arrange(desc(mean_fund_rnd))
## Warning: Factor `markets3` contains implicit NA, consider using
## `forcats::fct_explicit_na`
fund_rnd_market_time1
## # A tibble: 8 x 4
##   markets3                    mean_fund_rnd sum_fund_rnd prop_funding
##   <fct>                               <dbl>        <int>        <dbl>
## 1 Big Data Analytics/Security          2.2            11      0.0301 
## 2 Internet                             1.55           31      0.0849 
## 3 Finance                              1.5             6      0.0164 
## 4 Energy                               1.38           11      0.0301 
## 5 Other                                1.27          206      0.564  
## 6 Health/Medicine/Biotech              1.26           43      0.118  
## 7 <NA>                                 1.06           55      0.151  
## 8 Apps/Social Media                    1               2      0.00548
ggplot(fund_rnd_market_time1) + geom_col(mapping = aes(x = markets3, y = mean_fund_rnd)) + scale_x_discrete("Market") + scale_y_continuous("Mean Funding Rounds") + ggtitle("Mean Funding Rounds by Market") + theme(plot.title = element_text(hjust = 0.5)) + coord_flip() 

fund_rnd_market_time2 <-vc_data_clean %>%  group_by(markets3) %>% filter(!is.na(funding_rounds), founded_at > 1990-01-01) %>%  summarise(mean_fund_rnd = mean(funding_rounds), sum_fund_rnd = sum(funding_rounds)) %>% mutate(prop_funding = sum_fund_rnd/sum(sum_fund_rnd)) %>% arrange(desc(mean_fund_rnd))
## Warning: Factor `markets3` contains implicit NA, consider using
## `forcats::fct_explicit_na`
fund_rnd_market_time2
## # A tibble: 8 x 4
##   markets3                    mean_fund_rnd sum_fund_rnd prop_funding
##   <fct>                               <dbl>        <int>        <dbl>
## 1 Health/Medicine/Biotech              2.16         9328       0.136 
## 2 Big Data Analytics/Security          2.07         2965       0.0432
## 3 Energy                               2.06         1880       0.0274
## 4 Apps/Social Media                    1.82         5520       0.0804
## 5 Internet                             1.80         9850       0.143 
## 6 Finance                              1.80         1635       0.0238
## 7 Other                                1.75        35321       0.514 
## 8 <NA>                                 1.12         2159       0.0314
ggplot(fund_rnd_market_time2) + geom_col(mapping = aes(x = markets3, y = mean_fund_rnd)) + scale_x_discrete("Market") + scale_y_continuous("Mean Funding Rounds") + ggtitle("Mean Funding Rounds by Market") + theme(plot.title = element_text(hjust = 0.5)) + coord_flip() 

Post-IPO Equity Value Among Operating Companies and Number of IPO’s Among Companies

head(vc_data_clean %>% arrange(desc(post_ipo_equity)))
##                     name
## 1              Clearwire
## 2 Charter Communications
## 3                  Youku
## 4                  Xerox
## 5                    AOL
## 6             BlackBerry
##                                                 category_list
## 1                                           |Internet|Mobile|
## 2                                                        <NA>
## 3                                    |Technology|Photography|
## 4                                       |Hardware + Software|
## 5 |Advertising Platforms|Content Creators|Digital Media|News|
## 6                   |RIM|Wireless|Mobile|Hardware + Software|
##                  market funding_total_usd    status country_code state_code
## 1             Internet         5700000000  acquired          USA         WA
## 2                  <NA>        5162513431 operating          USA         CT
## 3           Technology         1320000000 operating          CHN       <NA>
## 4  Hardware + Software         1100000000 operating          USA         AL
## 5     Content Creators         1000000000 operating          USA         NY
## 6             Wireless         1000000000 operating          CAN         ON
##          region     city funding_rounds founded_at founded_month
## 1       Seattle Kirkland              4 2003-10-01       October
## 2      Hartford Stamford              2 1993-01-01       January
## 3      Shanghai Shanghai              6 2006-12-21      December
## 4    AL - Other   Normal              1 1906-01-01       January
## 5 New York City New York              1 1985-05-24           May
## 6       Toronto Waterloo              1 1984-01-01       January
##   founded_quarter founded_year first_funding_at last_funding_at seed venture
## 1              Q4         2003       2003-05-06      2003-02-27    0 0.0e+00
## 2              Q1         1993       1993-11-21      1993-09-15    0 0.0e+00
## 3              Q4         2006       2006-12-21      2006-04-28    0 1.2e+08
## 4              Q1         1906       1906-03-15      1906-03-15    0 0.0e+00
## 5              Q2         1985       1985-12-01      1985-12-01    0 0.0e+00
## 6              Q1         1984       1984-11-04      1984-11-04    0 0.0e+00
##   equity_crowdfunding undisclosed convertible_note debt_financing angel grant
## 1                   0           0                0              0     0     0
## 2                   0           0                0              0     0     0
## 3                   0           0                0              0     0     0
## 4                   0           0                0              0     0     0
## 5                   0           0                0              0     0     0
## 6                   0           0                0              0     0     0
##   private_equity post_ipo_equity post_ipo_debt secondary_market
## 1          8e+07      4700000000       9.2e+08                0
## 2          0e+00      1662513431       3.5e+09                0
## 3          0e+00      1200000000       0.0e+00                0
## 4          0e+00      1100000000       0.0e+00                0
## 5          0e+00      1000000000       0.0e+00                0
## 6          0e+00      1000000000       0.0e+00                0
##   product_crowdfunding round_A  round_B  round_C  round_D  round_E
## 1                    0       0        0        0        0        0
## 2                    0       0        0        0        0        0
## 3                    0 3000000 12000000 25000000 40000000 40000000
## 4                    0       0        0        0        0        0
## 5                    0       0        0        0        0        0
## 6                    0       0        0        0        0        0
##   fund_date_issue markets3
## 1           FALSE Internet
## 2           FALSE     <NA>
## 3            TRUE    Other
## 4           FALSE    Other
## 5           FALSE    Other
## 6           FALSE    Other
tapply(vc_data_clean$post_ipo_equity,vc_data_clean$status,length)
##  acquired    closed operating 
##      3692      2603     41829
## Are companies that receieve more funding rounds on average more likely to issue an IPO? 

fund_rnd_num_ipo <- vc_data_clean %>% group_by(funding_rounds) %>% filter(!is.na(post_ipo_equity), post_ipo_equity > 0) %>% summarise(num_ipos = length(post_ipo_equity))
fund_rnd_num_ipo
## # A tibble: 12 x 2
##    funding_rounds num_ipos
##             <int>    <int>
##  1              1      139
##  2              2       62
##  3              3       39
##  4              4       19
##  5              5       18
##  6              6       19
##  7              7        7
##  8              8        2
##  9              9        5
## 10             10        3
## 11             11        2
## 12             14        1
## Are companies that recieve more venture funding on average more likely to issue an IPO?

venture_num_ipo <- vc_data_clean2 %>% group_by(venture_cats) %>% filter(!is.na(post_ipo_equity), post_ipo_equity > 0) %>% summarise(num_ipos = length(post_ipo_equity))
venture_num_ipo
## # A tibble: 3 x 2
##   venture_cats       num_ipos
##   <fct>                 <int>
## 1 0 to 117 million        109
## 2 117 to 235 million        6
## 3 235 to 352 million        1
## Are operating companies that recieve more funding rounds on average more likely to have a higher post-IPO value? 

op_ipo_market1 <- vc_data_clean %>% group_by(funding_rounds) %>% filter(status == "operating", !is.na(post_ipo_equity), post_ipo_equity >0) %>% summarise(mean_ipo_equity = mean(post_ipo_equity), max_ipo_equity = max(post_ipo_equity), min_ipo_equity = min(post_ipo_equity))
op_ipo_market1
## # A tibble: 12 x 4
##    funding_rounds mean_ipo_equity max_ipo_equity min_ipo_equity
##             <int>           <dbl>          <dbl>          <dbl>
##  1              1      100229068.     1100000000          10500
##  2              2       84592892.     1662513431         119238
##  3              3       36121746.      211000000         800000
##  4              4       74293737       478000000        1000000
##  5              5       69530041.      264000000        2300000
##  6              6      181599171.     1200000000        3467747
##  7              7       65529415.      186000000        1303000
##  8              8        6962500        12500000        1425000
##  9              9       38406000        74700000         630000
## 10             10       90353596.      166228809         353500
## 11             11       27899452.       40000000       15798903
## 12             14        2000000         2000000        2000000
ggplot(op_ipo_market1) + geom_col(mapping = aes(x = funding_rounds, y = mean_ipo_equity)) + xlab("Rounds") + scale_y_continuous("Mean Post-IPO Equity") + ggtitle("Mean Post-IPO Equity by Funding Rounds") + theme(plot.title = element_text(hjust = 0.5)) + coord_flip()

## Are operating companies that recieve more venture funding on average more likely to have a higher mean post-IPO value?

op_ipo_market_2 <- vc_data_clean2 %>% group_by(venture_cats) %>% filter(status == "operating", !is.na(post_ipo_equity), post_ipo_equity > 0, !is.na(venture)) %>% summarise(mean_ipo_equity = mean(post_ipo_equity), max_ipo_equity = max(post_ipo_equity), min_ipo_equity = min(post_ipo_equity))
op_ipo_market_2
## # A tibble: 3 x 4
##   venture_cats       mean_ipo_equity max_ipo_equity min_ipo_equity
##   <fct>                        <dbl>          <dbl>          <dbl>
## 1 0 to 117 million         57747995.      739265776         119238
## 2 117 to 235 million      188618915.      736000000       40000000
## 3 235 to 352 million      104478480       104478480      104478480
ggplot(op_ipo_market_2) + geom_col(mapping = aes(x = venture_cats, y = mean_ipo_equity))  + scale_x_discrete("Venture Funding") + scale_y_continuous("Mean Funding Rounds") + ggtitle("Mean Post-IPO Equity by Venture Funding") + theme(plot.title = element_text(hjust = 0.5)) + coord_flip() 

Subsetting the data to do a microanalysis of the Biotechnology market.

# unique(vc_data4$market)
Biotech <- vc_data_clean %>%
  select(c(1, 3, 4, 5, 6, 7, 9, 12, 14)) %>%
  filter(market == " Biotechnology ")
head(Biotech)
##                      name          market funding_total_usd    status
## 1        10X Technologies  Biotechnology            3000000 operating
## 2 20/20 Gene Systems Inc.  Biotechnology             150000 operating
## 3      22nd Century Group  Biotechnology           15034750 operating
## 4                 23andMe  Biotechnology          111949900 operating
## 5         3-V Biosciences  Biotechnology           78089000 operating
## 6              360imaging  Biotechnology            1000000 operating
##   country_code state_code          city founded_month founded_year
## 1          USA         CA       Oakland       January         2012
## 2          USA         MD     Rockville           May         2000
## 3          USA         NY      Clarence       January         1998
## 4          USA         CA Mountain View         April         2006
## 5          USA         CA    Menlo Park         March         2007
## 6          USA         GA       Atlanta       January         2005
view(Biotech)

The plot below is meant to visualize the status of the biotechnology companies.

plt <- Biotech %>%
  ggplot()+
  geom_col(aes(x = status, y = nrow(Biotech), fill = status))+
  ylab("Number of Companies")+
  xlab("Operational status")
plt

library(plyr)
## ------------------------------------------------------------------------------
## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)
## ------------------------------------------------------------------------------
## 
## Attaching package: 'plyr'
## The following object is masked from 'package:lubridate':
## 
##     here
## The following objects are masked from 'package:plotly':
## 
##     arrange, mutate, rename, summarise
## The following objects are masked from 'package:dplyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize
## The following object is masked from 'package:purrr':
## 
##     compact
Comp_year <- count(Biotech, "founded_year") %>%
  rename(c("freq" = "total_comp_founded"))
# view(Comp_year)
Comp_year <- Comp_year[-c(53), ]
summary(Comp_year)
##   founded_year  total_comp_founded
##  Min.   :1912   Min.   :  1.00    
##  1st Qu.:1976   1st Qu.:  1.75    
##  Median :1988   Median :  9.50    
##  Mean   :1983   Mean   : 47.37    
##  3rd Qu.:2001   3rd Qu.: 89.25    
##  Max.   :2014   Max.   :212.00
Comp_country <- count(Biotech, "country_code") %>%
  rename(c("freq" = "total_founded"))
# view(Comp_country)
summary(Comp_country)
##   country_code total_founded    
##  ARE    : 1    Min.   :   1.00  
##  ARG    : 1    1st Qu.:   1.00  
##  AUS    : 1    Median :   6.00  
##  AUT    : 1    Mean   :  83.82  
##  BEL    : 1    3rd Qu.:  27.00  
##  (Other):38    Max.   :2716.00  
##  NA's   : 1
Comp_state <- count(Biotech, "state_code") %>%
  rename(c("freq" = "total_founded"))
# view(Comp_state)
summary(Comp_state)
##    state_code total_founded  
##  AB     : 1   Min.   :  1.0  
##  AK     : 1   1st Qu.:  7.0  
##  AL     : 1   Median : 26.0  
##  AR     : 1   Mean   : 64.7  
##  AZ     : 1   3rd Qu.: 55.0  
##  (Other):51   Max.   :859.0  
##  NA's   : 1

The code below shows that on average there is more funding assocaited with biotechnology companies that have been acquired.

Bt2 <- Biotech %>%
  select(c(3, 4)) %>%
  group_by(status) %>%
  summarise_each(funs(mean(., na.rm = TRUE))) %>%
  rename(c("funding_total_usd" = "mfunding_per_status"))
## Warning: Factor `status` contains implicit NA, consider using
## `forcats::fct_explicit_na`
## Warning: funs() is soft deprecated as of dplyr 0.8.0
## Please use a list of either functions or lambdas: 
## 
##   # Simple named list: 
##   list(mean = mean, median = median)
## 
##   # Auto named with `tibble::lst()`: 
##   tibble::lst(mean, median)
## 
##   # Using lambdas
##   list(~ mean(., trim = .2), ~ median(., na.rm = TRUE))
## This warning is displayed once per session.
head(Bt2)
## # A tibble: 4 x 2
##   status    mfunding_per_status
##   <fct>                   <dbl>
## 1 acquired            36644074.
## 2 closed              22510100.
## 3 operating           19728717.
## 4 <NA>                25281727.
# view(Bt2)

What is the average amount of funding on a global level, North America level, on a city level, and per year?

The following chucks of code create subsetted datasets containing the desired information.

Bt3 shows the average funding per country.

Bt4 shows the average funding per state/province in North America.

Bt5 shows the average funding per city, globally.

Bt6 shows the average funding per year.

Bt3 <- Biotech %>%
  select(c(3, 5)) %>%
  group_by(country_code) %>%
  summarise_each(funs(mean(., na.rm = TRUE))) %>%
  rename(c("funding_total_usd" = "mfunding_per_country"))
## Warning: Factor `country_code` contains implicit NA, consider using
## `forcats::fct_explicit_na`
head(Bt3)
## # A tibble: 6 x 2
##   country_code mfunding_per_country
##   <fct>                       <dbl>
## 1 ARE                     12000000 
## 2 ARG                          NaN 
## 3 AUS                      9508229.
## 4 AUT                     13802606.
## 5 BEL                     33628258.
## 6 BRA                      4855828.
# view(Bt3)
Bt4 <- Biotech %>%
  select(c(3, 6)) %>%
  group_by(state_code) %>%
  summarise_each(funs(mean(., na.rm = TRUE))) %>%
  rename(c("funding_total_usd" = "mfunding_per_state"))
## Warning: Factor `state_code` contains implicit NA, consider using
## `forcats::fct_explicit_na`
head(Bt4)
## # A tibble: 6 x 2
##   state_code mfunding_per_state
##   <fct>                   <dbl>
## 1 AB                  14295781.
## 2 AK                     45000 
## 3 AL                   4834702.
## 4 AR                  10616017.
## 5 AZ                  11963997.
## 6 BC                  10212936.
# view(Bt4)
Bt5 <- Biotech %>%
  select(c(3, 7)) %>%
  group_by(city) %>%
  summarise_each(funs(mean(., na.rm = TRUE))) %>%
  rename(c("funding_total_usd" = "mfunding_per_city"))
## Warning: Factor `city` contains implicit NA, consider using
## `forcats::fct_explicit_na`
head(Bt5)
## # A tibble: 6 x 2
##   city          mfunding_per_city
##   <fct>                     <dbl>
## 1 "\xc7an"              15270909 
## 2 "\xc9cully"            7136390.
## 3 "\xc9vry"               387000 
## 4 "A Coru\xf1a"          2033700 
## 5 "Aachen"               7980528 
## 6 "Aarhus"               5220000
# view(Bt5)
Bt6 <- Biotech %>%
  select(c(3, 9)) %>%
  group_by(founded_year) %>%
  summarise_each(funs(mean(., na.rm = TRUE))) %>% 
  rename(c("funding_total_usd" = "mean_funding"))
head(Bt6)
## # A tibble: 6 x 2
##   founded_year mean_funding
##          <int>        <dbl>
## 1         1912      3500000
## 2         1919     18100000
## 3         1921        48000
## 4         1928    100000000
## 5         1945      7400000
## 6         1947     34000000
# view(Bt6)

Where are there NA values for ‘funding_total_usd’ and what countries are they associated with?

The below code identifies five countries that have no funding total information associated with them. These countries are Mexico, Hungry, Argentina, Thailand, Lithuania. However, it is interesting to note that these countries did have one biotechnology company founded.

Biot <- Biotech %>%
  select(c(3, 5)) %>%
  filter(country_code == "ARG" | country_code == "HUN" | country_code == "LTU" | country_code == "MEX" | country_code == "THA")
head(Biot)
##   funding_total_usd country_code
## 1                NA          HUN
## 2                NA          ARG
## 3                NA          LTU
## 4                NA          THA
## 5                NA          MEX

How does the mean and the total funding compare for each of the levels of interest? How does the average compare with the theoretical allocation of funds (i.e. how does the mean compare to the total funding/the number of companies founded)?

Each “Analysis” is the dataset that contains the answer to this question at the various levels of interest.

Bt7 <- Biotech %>%
  select(c(3, 9)) %>%
  group_by(founded_year) %>%
  summarise_each(funs(sum(., na.rm = TRUE))) %>%
  rename(c("funding_total_usd" = "sum_funding"))
head(Bt7)
## # A tibble: 6 x 2
##   founded_year sum_funding
##          <int>       <dbl>
## 1         1912     3500000
## 2         1919    18100000
## 3         1921       48000
## 4         1928   100000000
## 5         1945     7400000
## 6         1947    34000000
# view(Bt7)
Temp <- merge(Bt6, Bt7, by = 'founded_year')
tail(Temp)
##    founded_year mean_funding sum_funding
## 48         2010     12738424  2394823700
## 49         2011     11578476  1991497862
## 50         2012      8639931  1235510120
## 51         2013     11797488   967394047
## 52         2014      4942266    34595862
## 53           NA     16420586 18850832645
Analysis1 <- merge(Temp, Comp_year, by = 'founded_year') %>%
  mutate(funds_per_comp = sum_funding / total_comp_founded)
tail(Analysis1)
##    founded_year mean_funding sum_funding total_comp_founded funds_per_comp
## 47         2009     11337623  2165486012                195       11105056
## 48         2010     12738424  2394823700                197       12156465
## 49         2011     11578476  1991497862                180       11063877
## 50         2012      8639931  1235510120                155        7971033
## 51         2013     11797488   967394047                 92       10515153
## 52         2014      4942266    34595862                  9        3843985
# view(Analysis1)
Bt8 <- Biotech %>%
  select(c(3, 6)) %>%
  group_by(state_code) %>%
  summarise_each(funs(sum(., na.rm = TRUE))) %>%
  rename(c("funding_total_usd" = "sfunding_per_state"))
## Warning: Factor `state_code` contains implicit NA, consider using
## `forcats::fct_explicit_na`
head(Bt8)
## # A tibble: 6 x 2
##   state_code sfunding_per_state
##   <fct>                   <dbl>
## 1 AB                  157253596
## 2 AK                      45000
## 3 AL                  125702241
## 4 AR                   63696100
## 5 AZ                  358919918
## 6 BC                  234897529
# view(Bt8)

Temp2 <- merge(Bt4, Bt8, by = 'state_code')
tail(Temp2)
##    state_code mfunding_per_state sfunding_per_state
## 52         VA           16837485          656661896
## 53         WA            6329623          468392091
## 54         WI            8015495          312604315
## 55         WV            4318014            8636027
## 56         WY            1305000            1305000
## 57       <NA>           20226494        15594627186
Analysis2 <- merge(Temp2, Comp_state, by = 'state_code') %>%
  mutate(funds_per_comp = sfunding_per_state / total_founded) %>%
  rename(c("state_code" = "state"))
tail(Analysis2)
##    state mfunding_per_state sfunding_per_state total_founded funds_per_comp
## 52    VA           16837485          656661896            45       14592487
## 53    WA            6329623          468392091            75        6245228
## 54    WI            8015495          312604315            39        8015495
## 55    WV            4318014            8636027             2        4318014
## 56    WY            1305000            1305000             1        1305000
## 57  <NA>           20226494        15594627186           859       18154397
view(Analysis2)
Bt9 <- Biotech %>%
  select(c(3, 5)) %>%
  group_by(country_code) %>%
  summarise_each(funs(sum(., na.rm = TRUE))) %>%
  rename(c("funding_total_usd" = "sfunding_per_country"))
## Warning: Factor `country_code` contains implicit NA, consider using
## `forcats::fct_explicit_na`
head(Bt9)
## # A tibble: 6 x 2
##   country_code sfunding_per_country
##   <fct>                       <dbl>
## 1 ARE                      12000000
## 2 ARG                             0
## 3 AUS                     285246867
## 4 AUT                     165631267
## 5 BEL                     672565168
## 6 BRA                      24279142
# view(Bt9)

Temp3 <- merge(Bt3, Bt9, by = 'country_code')
tail(Temp3)
##    country_code mfunding_per_country sfunding_per_country
## 39          THA                  NaN                    0
## 40          TUR             47890682            191562727
## 41          TWN             88158013            264474040
## 42          USA             21222290          56111735974
## 43          ZAF              2850000              2850000
## 44         <NA>             12832482           1527065388
Analysis3 <- merge(Temp3, Comp_country, by = 'country_code') %>%
  mutate(funds_per_comp = sfunding_per_country / total_founded) %>%
  rename(c("country_code" = "country"))
tail(Analysis3)
##    country mfunding_per_country sfunding_per_country total_founded
## 39     THA                  NaN                    0             1
## 40     TUR             47890682            191562727             4
## 41     TWN             88158013            264474040             3
## 42     USA             21222290          56111735974          2716
## 43     ZAF              2850000              2850000             1
## 44    <NA>             12832482           1527065388           132
##    funds_per_comp
## 39              0
## 40       47890682
## 41       88158013
## 42       20659697
## 43        2850000
## 44       11568677
view(Analysis3)
Comp_city <- count(Biotech, "city") %>%
  rename(c("freq" = "total_founded"))
# view(Comp_state)
summary(Comp_state)
##    state_code total_founded  
##  AB     : 1   Min.   :  1.0  
##  AK     : 1   1st Qu.:  7.0  
##  AL     : 1   Median : 26.0  
##  AR     : 1   Mean   : 64.7  
##  AZ     : 1   3rd Qu.: 55.0  
##  (Other):51   Max.   :859.0  
##  NA's   : 1
Bt10 <- Biotech %>%
  select(c(3, 7)) %>%
  group_by(city) %>%
  summarise_each(funs(sum(., na.rm = TRUE))) %>%
  rename(c("funding_total_usd" = "sfunding_per_city"))
## Warning: Factor `city` contains implicit NA, consider using
## `forcats::fct_explicit_na`
head(Bt10)
## # A tibble: 6 x 2
##   city          sfunding_per_city
##   <fct>                     <dbl>
## 1 "\xc7an"               45812727
## 2 "\xc9cully"            14272781
## 3 "\xc9vry"                387000
## 4 "A Coru\xf1a"           2033700
## 5 "Aachen"               15961056
## 6 "Aarhus"                5220000
# view(Bt10)

Temp4 <- merge(Bt5, Bt10, by = 'city')
tail(Temp4)
##             city mfunding_per_city sfunding_per_city
## 1037 Zevenbergen               NaN                 0
## 1038      Zhuhai           1762820           1762820
## 1039         Zug           2800000           2800000
## 1040  Zwijnaarde          50414506         100829011
## 1041 Zwingenberg          77766000          77766000
## 1042        <NA>          14195965        2171982642
Analysis4 <- merge(Temp4, Comp_city, by = 'city') %>%
  mutate(funds_per_comp = sfunding_per_city / total_founded)
tail(Analysis4)
##             city mfunding_per_city sfunding_per_city total_founded
## 1037 Zevenbergen               NaN                 0             1
## 1038      Zhuhai           1762820           1762820             1
## 1039         Zug           2800000           2800000             1
## 1040  Zwijnaarde          50414506         100829011             2
## 1041 Zwingenberg          77766000          77766000             1
## 1042        <NA>          14195965        2171982642           170
##      funds_per_comp
## 1037              0
## 1038        1762820
## 1039        2800000
## 1040       50414506
## 1041       77766000
## 1042       12776368
view(Analysis4)
# Which global regions receive the most funding on average? 

fund_global_market <- geographical_df %>% group_by(global_region) %>% filter(!is.na(funding_total_usd)) %>% summarise(mean_funding = mean(funding_total_usd)) %>% arrange(desc(mean_funding))
nrow(fund_global_market)
## [1] 1
fund_global_market
##   mean_funding
## 1     16889473
# Which global regions receive the most funding?

most_fund_global <- geographical_df %>% group_by(global_region) %>% filter(!is.na(funding_total_usd)) %>% summarise(sum_funding = sum(funding_total_usd)) %>% arrange(desc(sum_funding))
most_fund_global
##    sum_funding
## 1 626025224711
nrow(most_fund_global)
## [1] 1
head(most_fund_global)
##    sum_funding
## 1 626025224711
ggplot(geographical_df) + geom_col(mapping = aes(x =global_region, y = funding_total_usd)) + coord_flip()
## Warning: Removed 7075 rows containing missing values (position_stack).

most_fund_globallm <- lm(funding_total_usd ~ global_region, data = geographical_df)
summary(most_fund_globallm)
## 
## Call:
## lm(formula = funding_total_usd ~ global_region, data = geographical_df)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -2.438e+07 -1.792e+07 -1.257e+07 -4.057e+06  3.006e+10 
## 
## Coefficients:
##                                                   Estimate Std. Error t value
## (Intercept)                                       23401187    3468442   6.747
## global_regionBALTICS                             -20913797   20849675  -1.003
## global_regionC.W. OF IND. STATES                    976952    9614409   0.102
## global_regionEASTERN EUROPE                      -21099234   11710808  -1.802
## global_regionLATIN AMER. & CARIB                 -19331352    7113198  -2.718
## global_regionNEAR EAST                           -13559844    7181416  -1.888
## global_regionNORTHERN AFRICA                     -12149222   28531318  -0.426
## global_regionNORTHERN AMERICA                     -4726565    3640021  -1.298
## global_regionOCEANIA                             -11891632   10477336  -1.135
## global_regionSUB-SAHARAN AFRICA                  -12004736   17291222  -0.694
## global_regionWESTERN EUROPE                      -13533753    4148532  -3.262
##                                                  Pr(>|t|)    
## (Intercept)                                      1.53e-11 ***
## global_regionBALTICS                              0.31583    
## global_regionC.W. OF IND. STATES                  0.91906    
## global_regionEASTERN EUROPE                       0.07160 .  
## global_regionLATIN AMER. & CARIB                  0.00658 ** 
## global_regionNEAR EAST                            0.05901 .  
## global_regionNORTHERN AFRICA                      0.67024    
## global_regionNORTHERN AMERICA                     0.19412    
## global_regionOCEANIA                              0.25639    
## global_regionSUB-SAHARAN AFRICA                   0.48752    
## global_regionWESTERN EUROPE                       0.00111 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 176900000 on 37055 degrees of freedom
##   (7075 observations deleted due to missingness)
## Multiple R-squared:  0.0006608,  Adjusted R-squared:  0.0003911 
## F-statistic:  2.45 on 10 and 37055 DF,  p-value: 0.006383
anova(most_fund_globallm)
## Analysis of Variance Table
## 
## Response: funding_total_usd
##                  Df     Sum Sq    Mean Sq F value   Pr(>F)   
## global_region    10 7.6643e+17 7.6643e+16  2.4504 0.006383 **
## Residuals     37055 1.1590e+21 3.1278e+16                    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Does gdp have a correlation with VC funding?

gdp_funding <- geographical_df %>% group_by(gdp_per_capita) %>% filter(!is.na(funding_total_usd)) %>% summarise(mean_funding = mean(funding_total_usd)) %>% arrange(desc(mean_funding))

nrow(gdp_funding)
## [1] 1
head(gdp_funding)
##   mean_funding
## 1     16889473
# Does population have a correlation with VC funding?

pop_to_funding <- geographical_df %>% group_by(population) %>% filter(!is.na(funding_total_usd)) %>% summarise(mean_funding = mean(funding_total_usd)) %>% arrange(desc(mean_funding))
nrow(pop_to_funding)
## [1] 1
head(pop_to_funding)
##   mean_funding
## 1     16889473
pop_to_fundinglm <- lm(funding_total_usd ~ population, data = geographical_df)
summary(pop_to_fundinglm)
## 
## Call:
## lm(formula = funding_total_usd ~ population, data = geographical_df)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -3.779e+07 -1.655e+07 -1.225e+07 -6.283e+06  3.006e+10 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 1.161e+07  1.375e+06   8.438  < 2e-16 ***
## population  1.993e-02  3.863e-03   5.160 2.48e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 176800000 on 37064 degrees of freedom
##   (7075 observations deleted due to missingness)
## Multiple R-squared:  0.0007179,  Adjusted R-squared:  0.0006909 
## F-statistic: 26.63 on 1 and 37064 DF,  p-value: 2.48e-07
anova(pop_to_fundinglm)
## Analysis of Variance Table
## 
## Response: funding_total_usd
##               Df     Sum Sq    Mean Sq F value   Pr(>F)    
## population     1 8.3261e+17 8.3261e+17  26.627 2.48e-07 ***
## Residuals  37064 1.1589e+21 3.1269e+16                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Does population density have a correlation with VC funding? 

geographical_df$pop_dens_per_mi <- as.numeric(geographical_df$pop_dens_per_mi)
class(geographical_df$pop_dens_per_mi)
## [1] "numeric"
dens_to_fundinglm <- lm(funding_total_usd ~ pop_dens_per_mi, data = geographical_df)
summary(dens_to_fundinglm)
## 
## Call:
## lm(formula = funding_total_usd ~ pop_dens_per_mi, data = geographical_df)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -1.789e+07 -1.644e+07 -1.464e+07 -6.551e+06  3.006e+10 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     15851170    3390261   4.676 2.94e-06 ***
## pop_dens_per_mi     9817      30855   0.318     0.75    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 176900000 on 37064 degrees of freedom
##   (7075 observations deleted due to missingness)
## Multiple R-squared:  2.731e-06,  Adjusted R-squared:  -2.425e-05 
## F-statistic: 0.1012 on 1 and 37064 DF,  p-value: 0.7504
anova(dens_to_fundinglm)
## Analysis of Variance Table
## 
## Response: funding_total_usd
##                    Df     Sum Sq    Mean Sq F value Pr(>F)
## pop_dens_per_mi     1 3.1676e+15 3.1676e+15  0.1012 0.7504
## Residuals       37064 1.1598e+21 3.1291e+16
# Does market category have a correlation with VC funding?

market_to_fundinglm <- lm(funding_total_usd ~ markets3, data = geographical_df)
summary(market_to_fundinglm)
## 
## Call:
## lm(formula = funding_total_usd ~ markets3, data = geographical_df)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -3.725e+07 -1.521e+07 -1.314e+07 -5.738e+06  3.006e+10 
## 
## Coefficients:
##                                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                         16485786    2510528   6.567 5.22e-11 ***
## markets3Big Data Analytics/Security  -330914    5428582  -0.061 0.951393    
## markets3Apps/Social Media            3616481    4218785   0.857 0.391322    
## markets3Energy                      20768940    5840458   3.556 0.000377 ***
## markets3Health/Medicine/Biotech      4806392    3471825   1.384 0.166245    
## markets3Finance                      7136488    6463500   1.104 0.269548    
## markets3Other                       -2343883    2832504  -0.827 0.407962    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 178300000 on 35215 degrees of freedom
##   (8919 observations deleted due to missingness)
## Multiple R-squared:  0.0007046,  Adjusted R-squared:  0.0005343 
## F-statistic: 4.138 on 6 and 35215 DF,  p-value: 0.0003682
anova(market_to_fundinglm)
## Analysis of Variance Table
## 
## Response: funding_total_usd
##              Df     Sum Sq    Mean Sq F value    Pr(>F)    
## markets3      6 7.8922e+17 1.3154e+17  4.1384 0.0003682 ***
## Residuals 35215 1.1193e+21 3.1785e+16                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1