# Report the session's working directory (the project root)
getwd()
## [1] "/cloud/project"
# Check which files are available in the project's data folder
list.files(path = "/cloud/project/data")
## [1] "entrepreneurs_5000.csv"
# Read the entrepreneurs data set into `ent` and preview the first six rows
ent <- read.csv(file = "/cloud/project/data/entrepreneurs_5000.csv")
head(x = ent, n = 6L)
## Entrepreneur_ID Age Gender Annual_Income_USD BusinessStartDate BusinessType
## 1 ENT100000 39 Male 2447.42 2020-09-03 Retail
## 2 ENT100001 34 Male 3649.35 2021-04-06 Retail
## 3 ENT100002 41 Male 2219.63 2013-03-24 Retail
## 4 ENT100003 49 Male 1233.31 2020-07-22 Retail
## 5 ENT100004 33 Male 4171.77 2025-05-27 Services
## 6 ENT100005 33 Female 533.88 2023-05-06 Tech
## EducationLevel YearsExperience Employees Region FundingReceived
## 1 Bachelors 21 1 Mbarara 0
## 2 Masters 14 6 Kampala 1
## 3 Diploma 22 1 Jinja 0
## 4 Bachelors 24 7 Other 0
## 5 Masters 11 2 Kampala 0
## 6 Secondary 12 4 Other 0
## BusinessStage
## 1 Startup
## 2 Growth
## 3 Startup
## 4 Scale
## 5 Growth
## 6 Idea
# load packages
# Install tidyverse only when it is missing. An unconditional install.packages()
# re-downloads the package on every run and requires network access.
# dplyr and ggplot2 are core tidyverse packages (see the attach message below),
# so library(tidyverse) already attaches them; they need no separate install,
# and reinstalling a package that is currently loaded is unsafe.
if (!requireNamespace("tidyverse", quietly = TRUE)) {
  install.packages("tidyverse")
}
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.1 ✔ stringr 1.5.2
## ✔ ggplot2 4.0.0 ✔ tibble 3.3.0
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.1.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# What are the mean, median, and standard deviation of annual income among
# entrepreneurs?
# step 1: mean annual income, computed separately for each region
# (missing incomes are ignored)
library(dplyr)
regional_income <- ent |>
  group_by(Region) |>
  summarise(
    mean_income = mean(Annual_Income_USD, na.rm = TRUE),
    .groups = "drop"
  )
print(regional_income)
## # A tibble: 7 × 2
## Region mean_income
## <chr> <dbl>
## 1 Entebbe 3369.
## 2 Gulu 3315.
## 3 Jinja 3350.
## 4 Kampala 3348.
## 5 Mbale 3669.
## 6 Mbarara 3353.
## 7 Other 3402.
# step 2: median annual income, computed separately for each region
# (missing incomes are ignored)
med_income <- summarise(
  group_by(ent, Region),
  median_income = median(Annual_Income_USD, na.rm = TRUE),
  .groups = "drop"
)
print(med_income)
## # A tibble: 7 × 2
## Region median_income
## <chr> <dbl>
## 1 Entebbe 2555.
## 2 Gulu 2328.
## 3 Jinja 2499.
## 4 Kampala 2530.
## 5 Mbale 2558.
## 6 Mbarara 2543.
## 7 Other 2668.
# step 3: overall mean and standard deviation of annual income (all regions)
# na.rm = TRUE matches the per-region mean/median steps, so a missing income
# cannot turn the whole summary into NA. The data is not grouped here, so no
# `.groups` argument is needed.
std_income <- ent %>%
  summarise(
    avg_income = mean(Annual_Income_USD, na.rm = TRUE),
    std_dev = sd(Annual_Income_USD, na.rm = TRUE)
  )
print(std_income)
## avg_income std_dev
## 1 3381.903 2944.045
# Which region has the highest average number of employees?
# Average the Employees column within each region and sort so the region with
# the highest average comes first. Counting rows with n() would instead give
# the number of entrepreneurs per region, which does not answer the question.
employee_region <- ent %>%
  group_by(Region) %>%
  summarise(
    avg_employees = mean(Employees, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  arrange(desc(avg_employees))
print(employee_region)
# NOTE: re-run to regenerate the printed table. The table recorded previously
# listed entrepreneur counts per region and does not match this computation.
# visualize results
library(ggplot2)
ggplot(data = employee_region, mapping = aes(x = Region, y = avg_employees)) +
  geom_col(fill = "red", color = "black") +
  labs(title = "Average Employees by Region")

# What is the frequency distribution of business stages across the data?
# One bar per business stage; bar height is the number of rows in that stage.
ggplot(ent, aes(x = BusinessStage)) +
  labs(title = "Frequency Distribution of Business Stage") +
  geom_bar(fill = "lightblue", color = "black")

# What is the relationship between Years of Experience and Annual Income?
# Scatter plot with semi-transparent points so overlapping observations show.
ggplot(ent, aes(x = YearsExperience, y = Annual_Income_USD)) +
  labs(title = "Relationship between Years of Experience and Annual Income") +
  geom_point(alpha = 0.6)

# What is the correlation between the two?
# Uses only rows where both values are present.
correlation <- with(
  ent,
  cor(YearsExperience, Annual_Income_USD, use = "complete.obs")
)
print(correlation)
## [1] -0.0105347
# Is there a significant difference in annual income between male and female
# entrepreneurs?
# Gender holds more than two categories in this data: an ANOVA over the whole
# column reported 3 degrees of freedom for Gender, i.e. four levels. Testing
# all levels at once does not compare male with female specifically, so keep
# only those two groups before fitting.
gender_subset <- ent[ent$Gender %in% c("Male", "Female"), ]
anova_result <- aov(Annual_Income_USD ~ Gender, data = gender_subset)
summary(anova_result)
# NOTE: re-run to regenerate the printed ANOVA table. The table recorded
# previously was fitted on all four Gender levels.
# Is there an association between Business Type and Business Stage?
# step 1: cross-tabulate business type (rows) against business stage (columns)
tbl <- table(ent[["BusinessType"]], ent[["BusinessStage"]])
# step 2: run Pearson's chi-squared test on the contingency table
chi_result <- chisq.test(x = tbl)
# step 3: show the test statistic, degrees of freedom and p-value
print(chi_result)
##
## Pearson's Chi-squared test
##
## data: tbl
## X-squared = 6.5404, df = 15, p-value = 0.9692
# Build a model to predict Annual Income using Years of Experience, Education
# Level, and Funding Received as predictors.
# Ordinary least squares fit; the summary below reports the coefficients,
# their standard errors and the overall fit.
linear_model <- lm(
  formula = Annual_Income_USD ~ YearsExperience + EducationLevel +
    FundingReceived,
  data = ent
)
summary(linear_model)
##
## Call:
## lm(formula = Annual_Income_USD ~ YearsExperience + EducationLevel +
## FundingReceived, data = ent)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4162.2 -1764.3 -784.2 801.5 26962.4
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3280.678 98.100 33.442 <2e-16 ***
## YearsExperience -6.815 4.782 -1.425 0.1542
## EducationLevelDiploma -9.453 116.268 -0.081 0.9352
## EducationLevelMasters 308.131 131.442 2.344 0.0191 *
## EducationLevelNone -254.770 200.295 -1.272 0.2034
## EducationLevelPhD 646.205 298.168 2.167 0.0303 *
## EducationLevelSecondary -64.259 107.938 -0.595 0.5516
## FundingReceived 1159.952 115.770 10.019 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2912 on 4992 degrees of freedom
## Multiple R-squared: 0.02273, Adjusted R-squared: 0.02136
## F-statistic: 16.59 on 7 and 4992 DF, p-value: < 2.2e-16
# Model how business stage is influenced by Funding Received, Education Level,
# and Years of Experience.
# step 1: store the outcome as a factor so it is modelled as categories
ent[["BusinessStage"]] <- as.factor(ent[["BusinessStage"]])
# step 2: fit a multinomial logistic regression with nnet::multinom()
library(nnet)
logistical_model <- multinom(
  formula = BusinessStage ~ FundingReceived + EducationLevel + YearsExperience,
  data = ent
)
## # weights: 36 (24 variable)
## initial value 6931.471806
## iter 10 value 5785.468307
## iter 20 value 5751.746532
## iter 30 value 5746.536437
## iter 30 value 5746.536433
## iter 30 value 5746.536433
## final value 5746.536433
## converged
# Coefficients are reported for Idea, Scale and Startup (see output below).
summary(logistical_model)
## Call:
## multinom(formula = BusinessStage ~ FundingReceived + EducationLevel +
## YearsExperience, data = ent)
##
## Coefficients:
## (Intercept) FundingReceived EducationLevelDiploma EducationLevelMasters
## Idea -1.0562697 -0.7037304 0.090522307 0.06175435
## Scale -1.7517005 0.7074379 0.179923929 0.09960700
## Startup 0.3064338 -0.7837916 -0.009836127 -0.02273110
## EducationLevelNone EducationLevelPhD EducationLevelSecondary
## Idea 0.005613086 0.3953405 0.12767659
## Scale 0.384739975 0.3261484 0.12388471
## Startup -0.193947854 0.2250051 0.08686899
## YearsExperience
## Idea 0.001241928
## Scale -0.007138057
## Startup 0.006055797
##
## Std. Errors:
## (Intercept) FundingReceived EducationLevelDiploma EducationLevelMasters
## Idea 0.11389880 0.14693907 0.1348852 0.1524488
## Scale 0.14238779 0.12829733 0.1624307 0.1867160
## Startup 0.07585123 0.09249243 0.0903198 0.1018499
## EducationLevelNone EducationLevelPhD EducationLevelSecondary
## Idea 0.2297022 0.3351099 0.12601443
## Scale 0.2529472 0.4128041 0.15572639
## Startup 0.1566306 0.2398475 0.08391518
## YearsExperience
## Idea 0.005557876
## Scale 0.006866322
## Startup 0.003719751
##
## Residual Deviance: 11493.07
## AIC: 11541.07
# What is the average income by age?
# step 1: bin ages into bands; include.lowest = TRUE keeps age 18 in the first
# band, and every other band is closed on the right (e.g. 26-35 includes 35)
age_category <- cut(
  ent$Age,
  breaks = c(18, seq(25, 75, by = 10)),
  labels = c("18-25", "26-35", "36-45", "46-55", "56-65", "66-75"),
  include.lowest = TRUE
)
# step 2: attach the age band to the data as a new column
library(dplyr)
ent[["AgeCategory"]] <- age_category
# step 3: mean annual income per age band (missing incomes are ignored)
income_age <- ent |>
  group_by(AgeCategory) |>
  summarise(
    avg_income = mean(Annual_Income_USD, na.rm = TRUE),
    .groups = "drop"
  )
# step 4: print results
print(income_age)
## # A tibble: 6 × 2
## AgeCategory avg_income
## <fct> <dbl>
## 1 18-25 3275.
## 2 26-35 3487.
## 3 36-45 3338.
## 4 46-55 3339.
## 5 56-65 3084.
## 6 66-75 1966.
# step 5: bar chart of the average income in each age band
ggplot(income_age, aes(x = AgeCategory, y = avg_income)) +
  labs(title = "Average Income by Age Category") +
  geom_col(fill = "lightblue", color = "black")

# What percentage of entrepreneurs have received funding out of the total
# entrepreneurs?
# step 1: count entrepreneurs per funding flag, then express each count as a
# percentage of all entrepreneurs
funded_percentage <- ent %>%
  group_by(FundingReceived) %>%
  summarise(
    count = n() # number of entrepreneurs with this funding flag
  ) %>%
  mutate(
    percentage = (count / sum(count)) * 100
  )
# step 2: print results
print(funded_percentage)
## # A tibble: 2 × 3
## FundingReceived count percentage
## <int> <int> <dbl>
## 1 0 4252 85.0
## 2 1 748 15.0
# step 3: visualize results
# FundingReceived is a 0/1 flag stored as an integer. Wrapping it in factor()
# gives a discrete axis with one tick per category instead of a continuous
# numeric scale; the x label is set explicitly to keep the column name.
ggplot(
  data = funded_percentage,
  mapping = aes(x = factor(FundingReceived), y = percentage)
) +
  geom_col(color = "black", fill = "orange") +
  labs(title = "Funding Received Percentage", x = "FundingReceived")

# What is the number of employees by month?
# step 1: ensure the BusinessStartDate column is in correct format
ent$BusinessStartDate <- as.Date(ent$BusinessStartDate, format = "%Y-%m-%d")
# step 2: derive the (labelled) calendar month each business started in
library(lubridate)
ent <- ent %>%
  mutate(
    StartMonth = month(BusinessStartDate, label = TRUE)
  )
# step 3: total the Employees column per start month. Counting rows with n()
# would give the number of businesses started in each month, not the number
# of employees.
employee_month <- ent %>%
  group_by(StartMonth) %>%
  summarise(EmployeeCount = sum(Employees, na.rm = TRUE), .groups = "drop")
# step 4: print results
print(employee_month)
# NOTE: re-run to regenerate the printed table. The table recorded previously
# listed the number of businesses per start month.
# step 5: visualize results
ggplot(data = employee_month, mapping = aes(x = StartMonth, y = EmployeeCount)) +
  geom_col(color = "black", fill = "purple") +
  labs(title = "Number of Employees by Month")

# What are the average years of experience for Hospitality and Tech businesses
# in the Growth stage?
# step 1: keep Growth-stage Hospitality/Tech rows, then average
# YearsExperience per business type and stage (missing values are ignored)
measure <- ent |>
  filter(
    BusinessStage == "Growth",
    BusinessType %in% c("Hospitality", "Tech")
  ) |>
  group_by(BusinessType, BusinessStage) |>
  summarise(
    avg_years = mean(YearsExperience, na.rm = TRUE),
    .groups = "drop"
  )
# step 2: print results
print(measure)
## # A tibble: 2 × 3
## BusinessType BusinessStage avg_years
## <chr> <fct> <dbl>
## 1 Hospitality Growth 14.2
## 2 Tech Growth 14.1