# Report the working directory that the project runs from.
getwd()
## [1] "/cloud/project"
# List the contents of the data folder to confirm the CSV file is present.
list.files(path = "/cloud/project/data")
## [1] "entrepreneurs_5000.csv"
# Read the entrepreneurs data set into `ent` and preview its first six rows.
ent <- read.csv(file = "/cloud/project/data/entrepreneurs_5000.csv")
head(x = ent, n = 6L)
##   Entrepreneur_ID Age Gender Annual_Income_USD BusinessStartDate BusinessType
## 1       ENT100000  39   Male           2447.42        2020-09-03       Retail
## 2       ENT100001  34   Male           3649.35        2021-04-06       Retail
## 3       ENT100002  41   Male           2219.63        2013-03-24       Retail
## 4       ENT100003  49   Male           1233.31        2020-07-22       Retail
## 5       ENT100004  33   Male           4171.77        2025-05-27     Services
## 6       ENT100005  33 Female            533.88        2023-05-06         Tech
##   EducationLevel YearsExperience Employees  Region FundingReceived
## 1      Bachelors              21         1 Mbarara               0
## 2        Masters              14         6 Kampala               1
## 3        Diploma              22         1   Jinja               0
## 4      Bachelors              24         7   Other               0
## 5        Masters              11         2 Kampala               0
## 6      Secondary              12         4   Other               0
##   BusinessStage
## 1       Startup
## 2        Growth
## 3       Startup
## 4         Scale
## 5        Growth
## 6          Idea
# load packages

# Install only the packages that are not already available. Calling
# install.packages() unconditionally downloads and reinstalls each package on
# every run of the script.
required_packages <- c("tidyverse", "dplyr", "ggplot2")
is_installed <- vapply(
  required_packages, requireNamespace, logical(1),
  quietly = TRUE
)
missing_packages <- required_packages[!is_installed]
if (length(missing_packages) > 0) {
  install.packages(missing_packages)
}

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.1     ✔ stringr   1.5.2
## ✔ ggplot2   4.0.0     ✔ tibble    3.3.0
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.1.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

# dplyr and ggplot2 are core tidyverse packages (see the attach message
# above), so these calls are no-ops kept to make the dependencies explicit.
library(dplyr)
library(ggplot2)
# What are the mean, median, and standard deviation of annual income among
# entrepreneurs?

# Step 1: mean annual income, computed separately for each region
# (missing incomes are ignored).
library(dplyr)
regional_income <- summarise(
  group_by(ent, Region),
  mean_income = mean(Annual_Income_USD, na.rm = TRUE),
  .groups = "drop"
)
print(regional_income)
## # A tibble: 7 × 2
##   Region  mean_income
##   <chr>         <dbl>
## 1 Entebbe       3369.
## 2 Gulu          3315.
## 3 Jinja         3350.
## 4 Kampala       3348.
## 5 Mbale         3669.
## 6 Mbarara       3353.
## 7 Other         3402.
# Step 2: median annual income, computed separately for each region
# (missing incomes are ignored).
med_income <- summarise(
  group_by(ent, Region),
  median_income = median(Annual_Income_USD, na.rm = TRUE),
  .groups = "drop"
)
print(med_income)
## # A tibble: 7 × 2
##   Region  median_income
##   <chr>           <dbl>
## 1 Entebbe         2555.
## 2 Gulu            2328.
## 3 Jinja           2499.
## 4 Kampala         2530.
## 5 Mbale           2558.
## 6 Mbarara         2543.
## 7 Other           2668.
# Step 3: overall mean and standard deviation of annual income.
# `ent` is not grouped here, so no `.groups` argument is needed. `na.rm = TRUE`
# matches the regional summaries and keeps a single missing income from
# turning both statistics into NA.
std_income <- ent %>%
  summarise(
    avg_income = mean(Annual_Income_USD, na.rm = TRUE),
    std_dev = sd(Annual_Income_USD, na.rm = TRUE)
  )
print(std_income)
##   avg_income  std_dev
## 1   3381.903 2944.045
# Which region has the highest average number of employees?
# Average the Employees column within each region (counting rows with n()
# gives the number of entrepreneurs per region, not their employees) and sort
# so the region with the highest average comes first.
employee_region <- ent %>%
  group_by(Region) %>%
  summarise(
    avg_employees = mean(Employees, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  arrange(desc(avg_employees))
print(employee_region)

# visualize results
library(ggplot2)
ggplot(data = employee_region, mapping = aes(x = Region, y = avg_employees)) +
  geom_col(fill = "red", color = "black") +
  labs(title = "Average Number of Employees by Region")

# What is the frequency distribution of business stages across the data?
# Bar chart with one bar per business stage; bar height is the row count.
ggplot(ent, aes(x = BusinessStage)) +
  geom_bar(fill = "lightblue", color = "black") +
  labs(title = "Frequency Distribution of Business Stage")

# What is the relationship between years of experience and annual income?
# Scatter plot; points are drawn semi-transparent (alpha = 0.6).
ggplot(ent, aes(x = YearsExperience, y = Annual_Income_USD)) +
  geom_point(alpha = 0.6) +
  labs(title = "Relationship between Years of Experience and Annual Income")

# What is the correlation between the two?
# Only rows where both values are present are used (use = "complete.obs").
correlation <- with(
  ent,
  cor(YearsExperience, Annual_Income_USD, use = "complete.obs")
)
print(correlation)
## [1] -0.0105347
# Is there a significant difference in annual income between male and female
# entrepreneurs?
# Gender has more than two categories in this data (an ANOVA over every
# category reported 3 degrees of freedom for Gender), so restrict the
# comparison to the two groups the question asks about. With two groups the
# one-way ANOVA F test is equivalent to a pooled-variance two-sample t test.
male_female <- ent[ent$Gender %in% c("Male", "Female"), ]
anova_result <- aov(Annual_Income_USD ~ Gender, data = male_female)
summary(anova_result)
# Is there an association between business type and business stage?
# Step 1: cross-tabulate business type (rows) against business stage (columns).
tbl <- table(ent[["BusinessType"]], ent[["BusinessStage"]])

# Step 2: run the chi-squared test on the contingency table.
chi_result <- chisq.test(x = tbl)

# Step 3: show the test statistic, degrees of freedom and p-value.
print(chi_result)
## 
##  Pearson's Chi-squared test
## 
## data:  tbl
## X-squared = 6.5404, df = 15, p-value = 0.9692
# Build a model to predict annual income using years of experience, education
# level, and funding received as predictors.
linear_model <- lm(
  formula = Annual_Income_USD ~ YearsExperience + EducationLevel +
    FundingReceived,
  data = ent
)
summary(linear_model)
## 
## Call:
## lm(formula = Annual_Income_USD ~ YearsExperience + EducationLevel + 
##     FundingReceived, data = ent)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4162.2 -1764.3  -784.2   801.5 26962.4 
## 
## Coefficients:
##                         Estimate Std. Error t value Pr(>|t|)    
## (Intercept)             3280.678     98.100  33.442   <2e-16 ***
## YearsExperience           -6.815      4.782  -1.425   0.1542    
## EducationLevelDiploma     -9.453    116.268  -0.081   0.9352    
## EducationLevelMasters    308.131    131.442   2.344   0.0191 *  
## EducationLevelNone      -254.770    200.295  -1.272   0.2034    
## EducationLevelPhD        646.205    298.168   2.167   0.0303 *  
## EducationLevelSecondary  -64.259    107.938  -0.595   0.5516    
## FundingReceived         1159.952    115.770  10.019   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2912 on 4992 degrees of freedom
## Multiple R-squared:  0.02273,    Adjusted R-squared:  0.02136 
## F-statistic: 16.59 on 7 and 4992 DF,  p-value: < 2.2e-16
# Model how business stage is influenced by funding received, education level,
# and years of experience.
# Step 1: convert the dependent variable to a factor.
ent[["BusinessStage"]] <- as.factor(ent[["BusinessStage"]])

# Step 2: fit a multinomial logistic regression model.
library(nnet)
logistical_model <- multinom(
  formula = BusinessStage ~ FundingReceived + EducationLevel + YearsExperience,
  data = ent
)
## # weights:  36 (24 variable)
## initial  value 6931.471806 
## iter  10 value 5785.468307
## iter  20 value 5751.746532
## iter  30 value 5746.536437
## iter  30 value 5746.536433
## iter  30 value 5746.536433
## final  value 5746.536433 
## converged
summary(logistical_model)
## Call:
## multinom(formula = BusinessStage ~ FundingReceived + EducationLevel + 
##     YearsExperience, data = ent)
## 
## Coefficients:
##         (Intercept) FundingReceived EducationLevelDiploma EducationLevelMasters
## Idea     -1.0562697      -0.7037304           0.090522307            0.06175435
## Scale    -1.7517005       0.7074379           0.179923929            0.09960700
## Startup   0.3064338      -0.7837916          -0.009836127           -0.02273110
##         EducationLevelNone EducationLevelPhD EducationLevelSecondary
## Idea           0.005613086         0.3953405              0.12767659
## Scale          0.384739975         0.3261484              0.12388471
## Startup       -0.193947854         0.2250051              0.08686899
##         YearsExperience
## Idea        0.001241928
## Scale      -0.007138057
## Startup     0.006055797
## 
## Std. Errors:
##         (Intercept) FundingReceived EducationLevelDiploma EducationLevelMasters
## Idea     0.11389880      0.14693907             0.1348852             0.1524488
## Scale    0.14238779      0.12829733             0.1624307             0.1867160
## Startup  0.07585123      0.09249243             0.0903198             0.1018499
##         EducationLevelNone EducationLevelPhD EducationLevelSecondary
## Idea             0.2297022         0.3351099              0.12601443
## Scale            0.2529472         0.4128041              0.15572639
## Startup          0.1566306         0.2398475              0.08391518
##         YearsExperience
## Idea        0.005557876
## Scale       0.006866322
## Startup     0.003719751
## 
## Residual Deviance: 11493.07 
## AIC: 11541.07
# What is the average income by age?
# Step 1: bin ages into bands. include.lowest = TRUE places the lowest break
# value (18) inside the first band.
age_category <- cut(
  ent$Age,
  breaks = c(18, seq(from = 25, to = 75, by = 10)),
  labels = c("18-25", "26-35", "36-45", "46-55", "56-65", "66-75"),
  include.lowest = TRUE
)

# Step 2: attach the bands to the data as the AgeCategory column.
library(dplyr)
ent <- mutate(ent, AgeCategory = age_category)

# Step 3: mean annual income within each age band (missing incomes ignored).
income_age <- summarise(
  group_by(ent, AgeCategory),
  avg_income = mean(Annual_Income_USD, na.rm = TRUE),
  .groups = "drop"
)

# Step 4: show the summary table.
print(income_age)
## # A tibble: 6 × 2
##   AgeCategory avg_income
##   <fct>            <dbl>
## 1 18-25            3275.
## 2 26-35            3487.
## 3 36-45            3338.
## 4 46-55            3339.
## 5 56-65            3084.
## 6 66-75            1966.
# Step 5: bar chart of the mean income for each age band.
ggplot(income_age, aes(x = AgeCategory, y = avg_income)) +
  geom_col(fill = "lightblue", color = "black") +
  labs(title = "Average Income by Age Category")

# What percentage of entrepreneurs have received funding out of the total
# entrepreneurs?

# Step 1: count entrepreneurs in each funding group, then express each count
# as a percentage of the total across both groups.
funded_percentage <- ent %>%
  group_by(FundingReceived) %>%
  summarise(count = n()) %>%
  mutate(percentage = count / sum(count) * 100)

# Step 2: show the counts and percentages.
print(funded_percentage)
## # A tibble: 2 × 3
##   FundingReceived count percentage
##             <int> <int>      <dbl>
## 1               0  4252       85.0
## 2               1   748       15.0
# Step 3: visualize results.
# FundingReceived is a 0/1 indicator stored as an integer. Converting it to a
# factor gives a discrete axis with one labelled bar per group instead of a
# continuous numeric axis. The x label is set explicitly so the axis still
# reads "FundingReceived" rather than "factor(FundingReceived)".
ggplot(
  data = funded_percentage,
  mapping = aes(x = factor(FundingReceived), y = percentage)
) +
  geom_col(color = "black", fill = "orange") +
  labs(title = "Funding Received Percentage", x = "FundingReceived")

# What is the number of employees by month?

# Step 1: parse BusinessStartDate as a Date using the year-month-day format.
ent[["BusinessStartDate"]] <- as.Date(
  ent[["BusinessStartDate"]],
  format = "%Y-%m-%d"
)

# Step 2: derive the month of the start date as a labelled value
# (label = TRUE) and store it in the StartMonth column.
library(lubridate)
ent <- mutate(ent, StartMonth = month(BusinessStartDate, label = TRUE))

# step3: aggregate and group by start month
# Add up the Employees column within each month; n() would count the
# businesses started in that month rather than their employees.
employee_month <- ent %>%
  group_by(StartMonth) %>%
  summarise(EmployeeCount = sum(Employees, na.rm = TRUE), .groups = "drop")

# step4: print results
print(employee_month)
# Step 5: bar chart of EmployeeCount for each start month.
ggplot(employee_month, aes(x = StartMonth, y = EmployeeCount)) +
  geom_col(fill = "purple", color = "black") +
  labs(title = "Number of Employees by Month")

# What are the average years of experience for hospitality and tech businesses
# in the growth stage?

# Step 1: keep growth-stage hospitality and tech rows, then average years of
# experience for each business type and stage.
measure <- ent %>%
  filter(BusinessStage == "Growth") %>%
  filter(BusinessType %in% c("Hospitality", "Tech")) %>%
  group_by(BusinessType, BusinessStage) %>%
  summarise(avg_years = mean(YearsExperience, na.rm = TRUE), .groups = "drop")

# Step 2: show the result.
print(measure)
## # A tibble: 2 × 3
##   BusinessType BusinessStage avg_years
##   <chr>        <fct>             <dbl>
## 1 Hospitality  Growth             14.2
## 2 Tech         Growth             14.1