# Analysis by global demographic data ----

# Load the country-code lookup table and the country-level demographic data.
getwd()
c_code_df <- read.csv("C:/Users/12403/Desktop/MC - Summer II/DATA101/Project1/CountryCode.csv")
c_data_df <- read.csv("C:/Users/12403/Desktop/MC - Summer II/DATA101/Project1/CountryData.csv")

# Drop the Country.1 column and switch the remaining columns to snake_case.
c_code_df <- subset(c_code_df, select = -c(Country.1))
c_code_df <- c_code_df %>%
  rename(country = Country, country_code = Country.Code)

# read.csv() turns characters that are not valid in a name into ".", which is
# where the runs of dots come from. The "…" glyph in the earlier version of
# this script was a typographic stand-in for "..." and is not valid R.
# NOTE(review): confirm the dotted names against names(c_data_df).
c_data_df <- c_data_df %>%
  rename(
    country = Country,
    region = Region,
    population = Population,
    area_sq_mi = Area..sq..mi..,
    pop_dens_per_mi = Pop..Density..per.sq..mi..,
    gdp_per_capita = GDP....per.capita.,
    literacy = Literacy....,
    phones_per_1000 = Phones..per.1000.
  )

# Attach country codes to the demographic data. No `by` is given, so merge()
# joins on whatever column names the two frames have in common.
geographical_df1 <- merge(c_data_df, c_code_df)

# Join the VC data (x) to the demographics (y) on the country code. Columns
# present in both inputs get the .x / .y suffixes that the rename below uses.
geographical_df <- merge.data.frame(
  vc_data_clean,
  geographical_df1,
  by = "country_code"
)

# region.x comes from vc_data_clean and keeps the name `region`; region.y comes
# from the demographic data and becomes `global_region`. This rename must run
# only once: a second rename of region.x fails because the column is gone.
geographical_df <- geographical_df %>%
  rename(global_region = region.y, region = region.x)

# Overall mean funding. na.rm = TRUE because funding_total_usd is filtered for
# NA everywhere below; without it a single NA makes the mean NA.
mean_funding <- mean(geographical_df$funding_total_usd, na.rm = TRUE)

# Mean funding per global region, largest first.
fund_global_market <- geographical_df %>%
  filter(!is.na(funding_total_usd)) %>%
  group_by(global_region) %>%
  summarise(mean_funding = mean(funding_total_usd)) %>%
  arrange(desc(mean_funding))
nrow(fund_global_market)
head(fund_global_market)

ggplot(fund_global_market) +
  geom_col(mapping = aes(x = global_region, y = mean_funding)) +
  coord_flip()
# Which global regions receive the most funding? ----

# Total funding per global region, largest first. The column keeps the name
# funding_total_usd so the plot below can use it unchanged. as.numeric() guards
# against integer overflow in sum() should the column have been read as integer.
most_fund_global <- geographical_df %>%
  filter(!is.na(funding_total_usd)) %>%
  group_by(global_region) %>%
  summarise(funding_total_usd = sum(as.numeric(funding_total_usd))) %>%
  arrange(desc(funding_total_usd))

nrow(most_fund_global)
head(most_fund_global)

ggplot(most_fund_global) +
  geom_col(mapping = aes(x = global_region, y = funding_total_usd)) +
  coord_flip()

# Regress funding on global region using the row-level data.
most_fund_globallm <- lm(funding_total_usd ~ global_region, data = geographical_df)
summary(most_fund_globallm)
anova(most_fund_globallm)
# Does GDP have a correlation with VC funding? ----

# Mean funding for each distinct GDP-per-capita value, largest first.
gdp_funding <- geographical_df %>%
  filter(!is.na(funding_total_usd)) %>%
  group_by(gdp_per_capita) %>%
  summarise(mean_funding = mean(funding_total_usd)) %>%
  arrange(desc(mean_funding))

nrow(gdp_funding)
head(gdp_funding)

# Simple linear regression of funding on GDP per capita (row-level data).
gdp_to_fundinglm <- lm(funding_total_usd ~ gdp_per_capita, data = geographical_df)

summary(gdp_to_fundinglm)
anova(gdp_to_fundinglm)
# Does population have a correlation with VC funding? ----

# Row-level (population, funding) pairs with known funding, largest funding
# first. This returns the same rows the earlier grouped, unnamed summarise()
# produced, without relying on multi-row summarise() results.
pop_to_funding <- geographical_df %>%
  filter(!is.na(funding_total_usd)) %>%
  dplyr::select(population, funding_total_usd) %>%
  arrange(desc(funding_total_usd), population)
nrow(pop_to_funding)
head(pop_to_funding)

# Simple linear regression of funding on population.
pop_to_fundinglm <- lm(funding_total_usd ~ population, data = geographical_df)
summary(pop_to_fundinglm)
anova(pop_to_fundinglm)

# Coerce population density to numeric before modelling it.
# NOTE(review): if the column was read as text with decimal commas, as.numeric()
# yields NA with a warning; if it is a factor, it yields level codes. Confirm
# the source format.
geographical_df$pop_dens_per_mi <- as.numeric(geographical_df$pop_dens_per_mi)
class(geographical_df$pop_dens_per_mi)

# Simple linear regression of funding on population density.
dens_to_fundinglm <- lm(funding_total_usd ~ pop_dens_per_mi, data = geographical_df)
summary(dens_to_fundinglm)
anova(dens_to_fundinglm)
# Does literacy rate have a correlation with VC funding? ----

# Coerce literacy to numeric before modelling it.
# NOTE(review): if the column was read as text with decimal commas, as.numeric()
# yields NA with a warning; if it is a factor, it yields level codes. Confirm
# the source format.
geographical_df$literacy <- as.numeric(geographical_df$literacy)

# Simple linear regression of funding on literacy.
lit_to_fundinglm <- lm(funding_total_usd ~ literacy, data = geographical_df)
summary(lit_to_fundinglm)
anova(lit_to_fundinglm)
# Does market category have a correlation with VC funding? ----

# Regress funding on the markets3 column of geographical_df.
market_to_fundinglm <- lm(funding_total_usd ~ markets3, data = geographical_df)
summary(market_to_fundinglm)
anova(market_to_fundinglm)
# Can we create a linear model that explains global VC funding by
# demographics? ----

# Coerce phones_per_1000 to numeric.
# NOTE(review): none of the models below use phones_per_1000; confirm whether
# it was meant to be one of the predictors.
geographical_df$phones_per_1000 <- as.numeric(geographical_df$phones_per_1000)
class(geographical_df$phones_per_1000)

# Build the model up one predictor at a time, inspecting each fit.
global_analysislm0 <- lm(funding_total_usd ~ gdp_per_capita, data = geographical_df)
summary(global_analysislm0)
# Report the ANOVA for model 0 itself; model 1 does not exist yet at this point.
anova(global_analysislm0)

global_analysislm1 <- lm(
  funding_total_usd ~ gdp_per_capita + population,
  data = geographical_df
)
summary(global_analysislm1)
anova(global_analysislm1)

global_analysislm2 <- lm(
  funding_total_usd ~ gdp_per_capita + population + pop_dens_per_mi,
  data = geographical_df
)
summary(global_analysislm2)
anova(global_analysislm2)

global_analysislm3 <- lm(
  funding_total_usd ~ gdp_per_capita + population + pop_dens_per_mi + literacy,
  data = geographical_df
)
summary(global_analysislm3)
anova(global_analysislm3)

global_analysislm4 <- lm(
  funding_total_usd ~ gdp_per_capita + population + pop_dens_per_mi +
    literacy + markets3,
  data = geographical_df
)
summary(global_analysislm4)
anova(global_analysislm4)

global_analysislm5 <- lm(
  funding_total_usd ~ gdp_per_capita + population + pop_dens_per_mi +
    literacy + markets3 + area_sq_mi,
  data = geographical_df
)
summary(global_analysislm5)
anova(global_analysislm5)

global_analysislm6 <- lm(
  funding_total_usd ~ gdp_per_capita + population + pop_dens_per_mi +
    literacy + markets3 + area_sq_mi + global_region,
  data = geographical_df
)
summary(global_analysislm6)
anova(global_analysislm6)
# Regress funding on post-IPO equity. The model is fitted once and reused for
# both reports instead of being refitted for each call.
post_ipo_lm <- lm(funding_total_usd ~ post_ipo_equity, data = geographical_df)
summary(post_ipo_lm)
anova(post_ipo_lm)
# Save the merged VC + demographic data for later use. Straight quotes are
# required around the path: typographic quotes are not valid string delimiters.
write.csv(geographical_df, "C:/Users/12403/Desktop/MC - Summer II/DATA101/Project1/geographical.csv")