Raw Data

First we need to read several datasets downloaded from PitchBook, namely their Company, Fund, and Deal Investor Relation datasets. We will sythensize these datasets to create one that fits the needs of the project.

Below shows the distribution of types of funds in the raw dataset.

library(dplyr)
library(ggplot2)
library(tidyr)

# Data Import and Preprocessing
# Read datasets
fund_data <- read.csv("Data/all_funds.csv", header = TRUE)
seller <- read.csv("Data/deal_seller.csv", header = TRUE)
companies <- read.csv("Data/companies.csv", header = TRUE)
investor <- read.csv("Data/deal_investor.csv", header = TRUE)
table(fund_data$FundCategory)
## 
##         Co-Investment - General                            Debt 
##                             173                             518 
##         Fund of Funds - General                  Infrastructure 
##                              88                             133 
##                           Other                  Private Equity 
##                             301                            2864 
##       Real Assets - Real Estate Real Assets & Natural Resources 
##                             203                              94 
##           Secondaries - General                 Venture Capital 
##                              66                           15028

Filtering to only include venture funds in the United States results in 10242 funds. There are 169107 companies (from Companies dataset) and 871301 recorded deals involving investors (from Deal Investor Relation dataset).

venture_data <- subset(fund_data, FundCategory == "Venture Capital" & grepl("United States", Domiciles))

Sythensizing Datasets

The Fund dataset from PitchBook doesn’t include the companies in each fund. We must use the FundID to match deals make in the Deal Investor Relation dataset in order to find companies in each fund. We can then use the Companies dataset to find information about each company via CompanyID.

Matching Funds to Companies

# Attempt to load full_data.RData
if (file.exists("full_data.RData")) {
  load("full_data.RData")  # Load the file if it exists
} else {
  # Merge venture_data with investor
  merged_data <- merge(venture_data, investor, by.x = "FundID", by.y = "InvestorFundID")

  # Split companies by FundID
  fund_company_list <- split(merged_data[, c("CompanyID", "CompanyName")], merged_data$FundID)

  # Convert the list into a data frame
  fundid_companies <- data.frame(FundID = names(fund_company_list), Companies = I(fund_company_list), row.names = NULL)

  # Merge with venture_data
  result <- merge(fundid_companies, venture_data, by = "FundID")

  # Add full company details to the Companies column
  result$Companies <- lapply(result$Companies, function(fund_companies) {
    if (is.data.frame(fund_companies)) {
      # Join with companies dataset based on CompanyID
      merged_companies <- merge(fund_companies, companies, by = "CompanyID", all.x = TRUE)
      return(merged_companies)
    } else {
      # If not a data frame, return as is (handle edge cases)
      return(fund_companies)
    }
  })

  # Save the prepared data for future use
  save(result, file = "full_data.RData")
  message("Data preparation complete. Saved to full_data.RData.")
}

Below is a histogram showing the frequency of number of investments made per fund. Note a fund can invest in a company multiple times.

# Calculate the number of companies in each entry
result$num_investments_found <- sapply(result$Companies, function(companies) {
  if (is.data.frame(companies)) {
    nrow(companies)  # For data frames, count rows
  } else {
    length(companies$CompanyID)  # For lists, count elements in CompanyID
  }
})

# Create bins of size 10 and a special >100 bin
bins <- c(seq(0, 100, by = 10), Inf)  # Bin edges: 0-10, 11-20, ..., >100
labels <- c(paste(seq(1, 91, by = 10), seq(10, 100, by = 10), sep = "-"), ">100")
num_investments_binned <- cut(result$num_investments_found, breaks = bins, labels = labels, right = FALSE)

# Create the histogram
library(ggplot2)
ggplot(data = data.frame(NumInvestments = num_investments_binned), aes(x = NumInvestments)) +
  geom_bar(fill = "skyblue", color = "black") +
  labs(
    title = "Number of Investments per Fund",
    x = "Number of Investments",
    y = "Frequency"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))  # Rotate x-axis labels for readability

We now distinguish between individual investments and unique companies. Below is a histogram showing frequency of number of investments in unique companies.

# Calculate the number of unique companies for each entry and add as a new column
result$num_companies_found <- sapply(result$Companies, function(companies) {
  if (is.data.frame(companies)) {
    nrow(unique(companies))  # For data frames, count unique rows
  } else {
    length(unique(companies$CompanyID))  # For lists, count unique CompanyIDs
  }
})

# Create bins of size 10 and a special >100 bin
bins <- c(seq(0, 100, by = 10), Inf)  # Bin edges: 0-10, 11-20, ..., >100
labels <- c(paste(seq(1, 91, by = 10), seq(10, 100, by = 10), sep = "-"), ">100")
num_companies_binned <- cut(result$num_companies_found, breaks = bins, labels = labels, right = FALSE)

# Create the histogram
library(ggplot2)
ggplot(data = result, aes(x = num_companies_binned)) +
  geom_bar(fill = "skyblue", color = "black") +
  labs(
    title = "Distribution of Number of Companies per Fund",
    x = "Number of Companies",
    y = "Frequency"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))  # Rotate x-axis labels for readability

There is a variable in the raw Fund dataset FundIvestments that counts the number of investments made by the fund. This should in theory allign with the number of investments we manually find, but doesn’t. I need to look into this more.

# Calculate the difference
result$difference_investments <- result$FundInvestments - result$num_investments_found

# Summary of differences
summary_diff <- summary(result$difference_investments)

# Count number of entries where the difference is 0, positive, or negative
count_zero <- sum(result$difference_investments == 0, na.rm = TRUE)
count_positive <- sum(result$difference_investments > 0, na.rm = TRUE)
count_negative <- sum(result$difference_investments < 0, na.rm = TRUE)

# Create a summary table
difference_summary <- data.frame(
  Category = c("Exact Match", "More Investments Found", "Fewer Investments Found"),
  Count = c(count_zero, count_positive, count_negative)
)

print(difference_summary)
##                  Category Count
## 1             Exact Match  4484
## 2  More Investments Found  5123
## 3 Fewer Investments Found   565
print(summary_diff)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max.     NA's 
## -331.000    0.000    1.000    2.877    2.000  343.000        1

Tracking Current Company Ownership State

Next, we extract the current status of the companies within each fund and create variables that represent the count of companies in each status category for each fund.

# Get the unique ownership statuses
ownership_statuses <- unique(companies$OwnershipStatus)

# Initialize columns for each ownership status in result
for (status in ownership_statuses) {
  result[[status]] <- 0
}

# Count ownership statuses for each fund, ensuring uniqueness
for (i in 1:nrow(result)) {
  fund_companies <- result$Companies[[i]]
  
  if (is.data.frame(fund_companies)) {
    # Remove duplicates by CompanyID
    unique_companies <- fund_companies[!duplicated(fund_companies$CompanyID), ]
    
    # Count the OwnershipStatus values
    status_counts <- table(unique_companies$OwnershipStatus)
    
    # Update the result with the counts
    for (status in names(status_counts)) {
      result[i, status] <- status_counts[status]
    }
  }
}

Below is a bar chart showing the distribution of the current state of all companies backed by a fund from the dataset. NOTE: if a company is backed by multiple funds it is counted multiple times.

# Aggregate data to show the total count for each ownership status
ownership_summary <- result %>%
  select(ownership_statuses) %>%
  summarise(across(everything(), sum, na.rm = TRUE)) %>%
  pivot_longer(
    cols = everything(),
    names_to = "OwnershipStatus",
    values_to = "TotalCount"
  )

# Create a bar chart
ggplot(ownership_summary, aes(x = OwnershipStatus, y = TotalCount, fill = OwnershipStatus)) +
  geom_bar(stat = "identity") +
  labs(
    title = "Overall Distribution of Company Ownership Status",
    x = "Ownership Status",
    y = "Total Number of Companies"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))  # Rotate x-axis labels for readability

We don’t only care about the current ownership status - it matters more when companies went through IPOs, as that impacts VC firm reputation and therefore fundraising.

IPOs impacting fund size

One potential topic to explore is how previous IPOs affect fund size. First we create a variable for each VC firm that records the number of IPOs from previous funds.

result <- result %>%
  group_by(Investor) %>%  # Group by VC firm
  arrange(Vintage) %>%    # Sort funds by vintage year within each firm
  mutate(
    previous_IPOs = lag(cumsum(ifelse(`Publicly Held` > 0, 1, 0)), default = 0)  # Cumulative count of IPOs
  ) %>%
  ungroup()

Next we can plot number of IPOs versus fund size, with a trend line.

ggplot(result, aes(x = previous_IPOs, y = FundSize)) +
  geom_point(alpha = 0.6) +
  geom_smooth(method = "lm", se = TRUE, color = "blue") +  # Add a linear trend line
  labs(
    title = "Impact of Previous IPOs on Fundraising",
    x = "Number of Previous IPOs",
    y = "Fund Size"
  ) +
  theme_minimal()

It may be easier to see the trend if we plot average fund size versus number of previous IPOs by the firm.

# Summarize average fund size by previous IPOs
ipo_summary <- result %>%
  group_by(previous_IPOs) %>%
  summarise(
    avg_fund_size = mean(FundSize, na.rm = TRUE),
    count = n()
  )

# Visualize average fund size by previous IPOs
ggplot(ipo_summary, aes(x = previous_IPOs, y = avg_fund_size)) +
  geom_col(fill = "skyblue") +
  labs(
    title = "Average Fund Size by Number of Previous IPOs",
    x = "Number of Previous IPOs",
    y = "Average Fund Size"
  ) +
  theme_minimal()

Sorting by Venture Firm

result_split <- split(result, result$Investor)

# Sort each sub-data frame by Vintage
result_sorted_split <- lapply(result_split, function(df) {
  df %>% arrange(Vintage)
})