First we need to read several datasets downloaded from PitchBook, namely their Company, Fund, and Deal Investor Relation datasets. We will sythensize these datasets to create one that fits the needs of the project.
Below shows the distribution of types of funds in the raw dataset.
library(dplyr)
library(ggplot2)
library(tidyr)
# Data Import and Preprocessing
# Read datasets
fund_data <- read.csv("Data/all_funds.csv", header = TRUE)
seller <- read.csv("Data/deal_seller.csv", header = TRUE)
companies <- read.csv("Data/companies.csv", header = TRUE)
investor <- read.csv("Data/deal_investor.csv", header = TRUE)
table(fund_data$FundCategory)##
## Co-Investment - General Debt
## 173 518
## Fund of Funds - General Infrastructure
## 88 133
## Other Private Equity
## 301 2864
## Real Assets - Real Estate Real Assets & Natural Resources
## 203 94
## Secondaries - General Venture Capital
## 66 15028
Filtering to only include venture funds in the United States results in 10242 funds. There are 169107 companies (from Companies dataset) and 871301 recorded deals involving investors (from Deal Investor Relation dataset).
The Fund dataset from PitchBook doesn’t include the companies in each fund. We must use the FundID to match deals make in the Deal Investor Relation dataset in order to find companies in each fund. We can then use the Companies dataset to find information about each company via CompanyID.
# Attempt to load full_data.RData
if (file.exists("full_data.RData")) {
load("full_data.RData") # Load the file if it exists
} else {
# Merge venture_data with investor
merged_data <- merge(venture_data, investor, by.x = "FundID", by.y = "InvestorFundID")
# Split companies by FundID
fund_company_list <- split(merged_data[, c("CompanyID", "CompanyName")], merged_data$FundID)
# Convert the list into a data frame
fundid_companies <- data.frame(FundID = names(fund_company_list), Companies = I(fund_company_list), row.names = NULL)
# Merge with venture_data
result <- merge(fundid_companies, venture_data, by = "FundID")
# Add full company details to the Companies column
result$Companies <- lapply(result$Companies, function(fund_companies) {
if (is.data.frame(fund_companies)) {
# Join with companies dataset based on CompanyID
merged_companies <- merge(fund_companies, companies, by = "CompanyID", all.x = TRUE)
return(merged_companies)
} else {
# If not a data frame, return as is (handle edge cases)
return(fund_companies)
}
})
# Save the prepared data for future use
save(result, file = "full_data.RData")
message("Data preparation complete. Saved to full_data.RData.")
}Below is a histogram showing the frequency of number of investments made per fund. Note a fund can invest in a company multiple times.
# Calculate the number of companies in each entry
result$num_investments_found <- sapply(result$Companies, function(companies) {
if (is.data.frame(companies)) {
nrow(companies) # For data frames, count rows
} else {
length(companies$CompanyID) # For lists, count elements in CompanyID
}
})
# Create bins of size 10 and a special >100 bin
bins <- c(seq(0, 100, by = 10), Inf) # Bin edges: 0-10, 11-20, ..., >100
labels <- c(paste(seq(1, 91, by = 10), seq(10, 100, by = 10), sep = "-"), ">100")
num_investments_binned <- cut(result$num_investments_found, breaks = bins, labels = labels, right = FALSE)
# Create the histogram
library(ggplot2)
ggplot(data = data.frame(NumInvestments = num_investments_binned), aes(x = NumInvestments)) +
geom_bar(fill = "skyblue", color = "black") +
labs(
title = "Number of Investments per Fund",
x = "Number of Investments",
y = "Frequency"
) +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) # Rotate x-axis labels for readabilityWe now distinguish between individual investments and unique companies. Below is a histogram showing frequency of number of investments in unique companies.
# Calculate the number of unique companies for each entry and add as a new column
result$num_companies_found <- sapply(result$Companies, function(companies) {
if (is.data.frame(companies)) {
nrow(unique(companies)) # For data frames, count unique rows
} else {
length(unique(companies$CompanyID)) # For lists, count unique CompanyIDs
}
})
# Create bins of size 10 and a special >100 bin
bins <- c(seq(0, 100, by = 10), Inf) # Bin edges: 0-10, 11-20, ..., >100
labels <- c(paste(seq(1, 91, by = 10), seq(10, 100, by = 10), sep = "-"), ">100")
num_companies_binned <- cut(result$num_companies_found, breaks = bins, labels = labels, right = FALSE)
# Create the histogram
library(ggplot2)
ggplot(data = result, aes(x = num_companies_binned)) +
geom_bar(fill = "skyblue", color = "black") +
labs(
title = "Distribution of Number of Companies per Fund",
x = "Number of Companies",
y = "Frequency"
) +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) # Rotate x-axis labels for readabilityThere is a variable in the raw Fund dataset FundIvestments that counts the number of investments made by the fund. This should in theory allign with the number of investments we manually find, but doesn’t. I need to look into this more.
# Calculate the difference
result$difference_investments <- result$FundInvestments - result$num_investments_found
# Summary of differences
summary_diff <- summary(result$difference_investments)
# Count number of entries where the difference is 0, positive, or negative
count_zero <- sum(result$difference_investments == 0, na.rm = TRUE)
count_positive <- sum(result$difference_investments > 0, na.rm = TRUE)
count_negative <- sum(result$difference_investments < 0, na.rm = TRUE)
# Create a summary table
difference_summary <- data.frame(
Category = c("Exact Match", "More Investments Found", "Fewer Investments Found"),
Count = c(count_zero, count_positive, count_negative)
)
print(difference_summary)## Category Count
## 1 Exact Match 4484
## 2 More Investments Found 5123
## 3 Fewer Investments Found 565
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## -331.000 0.000 1.000 2.877 2.000 343.000 1
Next, we extract the current status of the companies within each fund and create variables that represent the count of companies in each status category for each fund.
# Get the unique ownership statuses
ownership_statuses <- unique(companies$OwnershipStatus)
# Initialize columns for each ownership status in result
for (status in ownership_statuses) {
result[[status]] <- 0
}
# Count ownership statuses for each fund, ensuring uniqueness
for (i in 1:nrow(result)) {
fund_companies <- result$Companies[[i]]
if (is.data.frame(fund_companies)) {
# Remove duplicates by CompanyID
unique_companies <- fund_companies[!duplicated(fund_companies$CompanyID), ]
# Count the OwnershipStatus values
status_counts <- table(unique_companies$OwnershipStatus)
# Update the result with the counts
for (status in names(status_counts)) {
result[i, status] <- status_counts[status]
}
}
}Below is a bar chart showing the distribution of the current state of all companies backed by a fund from the dataset. NOTE: if a company is backed by multiple funds it is counted multiple times.
# Aggregate data to show the total count for each ownership status
ownership_summary <- result %>%
select(ownership_statuses) %>%
summarise(across(everything(), sum, na.rm = TRUE)) %>%
pivot_longer(
cols = everything(),
names_to = "OwnershipStatus",
values_to = "TotalCount"
)
# Create a bar chart
ggplot(ownership_summary, aes(x = OwnershipStatus, y = TotalCount, fill = OwnershipStatus)) +
geom_bar(stat = "identity") +
labs(
title = "Overall Distribution of Company Ownership Status",
x = "Ownership Status",
y = "Total Number of Companies"
) +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) # Rotate x-axis labels for readability
We don’t only care about the current ownership status - it matters more
when companies went through IPOs, as that impacts VC firm reputation and
therefore fundraising.
One potential topic to explore is how previous IPOs affect fund size. First we create a variable for each VC firm that records the number of IPOs from previous funds.
result <- result %>%
group_by(Investor) %>% # Group by VC firm
arrange(Vintage) %>% # Sort funds by vintage year within each firm
mutate(
previous_IPOs = lag(cumsum(ifelse(`Publicly Held` > 0, 1, 0)), default = 0) # Cumulative count of IPOs
) %>%
ungroup()Next we can plot number of IPOs versus fund size, with a trend line.
ggplot(result, aes(x = previous_IPOs, y = FundSize)) +
geom_point(alpha = 0.6) +
geom_smooth(method = "lm", se = TRUE, color = "blue") + # Add a linear trend line
labs(
title = "Impact of Previous IPOs on Fundraising",
x = "Number of Previous IPOs",
y = "Fund Size"
) +
theme_minimal()It may be easier to see the trend if we plot average fund size versus number of previous IPOs by the firm.
# Summarize average fund size by previous IPOs
ipo_summary <- result %>%
group_by(previous_IPOs) %>%
summarise(
avg_fund_size = mean(FundSize, na.rm = TRUE),
count = n()
)
# Visualize average fund size by previous IPOs
ggplot(ipo_summary, aes(x = previous_IPOs, y = avg_fund_size)) +
geom_col(fill = "skyblue") +
labs(
title = "Average Fund Size by Number of Previous IPOs",
x = "Number of Previous IPOs",
y = "Average Fund Size"
) +
theme_minimal()