#loading libraries
library(readr)
library(Hmisc)
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
## Loading required package: ggplot2
##
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:base':
##
## format.pval, units
#loading data
df<-read_csv("data.csv")
## Rows: 200 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (1): ID
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
#exploratory data analysis
describe(df)
## df
##
## 4 Variables 200 Observations
## --------------------------------------------------------------------------------
## ID
## n missing distinct Info Mean Gmd .05 .10
## 200 0 200 1 100.5 67 10.95 20.90
## .25 .50 .75 .90 .95
## 50.75 100.50 150.25 180.10 190.05
##
## lowest : 1 2 3 4 5, highest: 196 197 198 199 200
## --------------------------------------------------------------------------------
## ARR
## n missing distinct Info Mean Gmd .05 .10
## 200 0 176 1 117494 153553 5402 7363
## .25 .50 .75 .90 .95
## 24000 50591 104093 241090 398636
##
## lowest : 1800 2542 2994 3250 3564
## highest: 750000 1013879 1468767 1872750 1923183
## --------------------------------------------------------------------------------
## Employees
## n missing distinct Info Mean Gmd .05 .10
## 200 0 156 1 13278 23641 11.90 25.00
## .25 .50 .75 .90 .95
## 90.75 563.00 3839.50 27950.00 57374.20
##
## lowest : 5 6 7 10 12, highest: 103000 135301 181000 292067 624000
## --------------------------------------------------------------------------------
## Revenue
## n missing distinct Info Mean Gmd .05 .10
## 200 0 173 1 6.198e+09 1.123e+10 2.108e+06 3.738e+06
## .25 .50 .75 .90 .95
## 1.360e+07 9.505e+07 1.685e+09 1.280e+10 3.463e+10
##
## lowest : 1.10000e+05 6.10000e+05 9.20000e+05 1.00000e+06 1.10000e+06
## highest: 6.34870e+10 8.01180e+10 1.17929e+11 1.82000e+11 1.84903e+11
##
## 0 (146, 0.730), 2e+09 (12, 0.060), 4e+09 (10, 0.050), 6e+09 (6, 0.030), 1e+10
## (2, 0.010), 1.2e+10 (4, 0.020), 1.4e+10 (1, 0.005), 1.6e+10 (5, 0.025), 2.2e+10
## (1, 0.005), 2.4e+10 (1, 0.005), 3.4e+10 (2, 0.010), 3.6e+10 (1, 0.005), 4e+10
## (2, 0.010), 4.8e+10 (1, 0.005), 5.4e+10 (1, 0.005), 6.4e+10 (1, 0.005), 8e+10
## (1, 0.005), 1.18e+11 (1, 0.005), 1.82e+11 (1, 0.005), 1.84e+11 (1, 0.005)
##
## For the frequency table, variable is rounded to the nearest 2e+09
## --------------------------------------------------------------------------------
summary(df)
## ID ARR Employees Revenue
## Min. : 1.00 Min. : 1800 Min. : 5.0 Min. :1.100e+05
## 1st Qu.: 50.75 1st Qu.: 24000 1st Qu.: 90.8 1st Qu.:1.360e+07
## Median :100.50 Median : 50591 Median : 563.0 Median :9.505e+07
## Mean :100.50 Mean : 117494 Mean : 13278.0 Mean :6.198e+09
## 3rd Qu.:150.25 3rd Qu.: 104093 3rd Qu.: 3839.5 3rd Qu.:1.685e+09
## Max. :200.00 Max. :1923183 Max. :624000.0 Max. :1.849e+11
#modeling
model1<-lm(ARR ~ Revenue + Employees, data = df)
model2<-lm(ARR ~ Revenue, data = df)
model3<-lm(ARR ~ Employees, data = df)
#analysis
summary(model1)
##
## Call:
## lm(formula = ARR ~ Revenue + Employees, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -680151 -74768 -43666 -2439 1714672
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 8.752e+04 1.619e+04 5.404 1.87e-07 ***
## Revenue 4.730e-06 8.007e-07 5.907 1.51e-08 ***
## Employees 5.007e-02 3.379e-01 0.148 0.882
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 219000 on 197 degrees of freedom
## Multiple R-squared: 0.1951, Adjusted R-squared: 0.1869
## F-statistic: 23.87 on 2 and 197 DF, p-value: 5.207e-10
summary(model2)
##
## Call:
## lm(formula = ARR ~ Revenue, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -682363 -74835 -43922 -2703 1715726
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 8.781e+04 1.603e+04 5.478 1.30e-07 ***
## Revenue 4.789e-06 6.915e-07 6.925 5.94e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 218500 on 198 degrees of freedom
## Multiple R-squared: 0.195, Adjusted R-squared: 0.1909
## F-statistic: 47.96 on 1 and 198 DF, p-value: 5.939e-11
summary(model3)
##
## Call:
## lm(formula = ARR ~ Employees, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -596375 -85933 -55260 -13277 1760249
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.036e+05 1.728e+04 5.994 9.51e-09 ***
## Employees 1.049e+00 3.166e-01 3.313 0.0011 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 237000 on 198 degrees of freedom
## Multiple R-squared: 0.05251, Adjusted R-squared: 0.04772
## F-statistic: 10.97 on 1 and 198 DF, p-value: 0.001099
AIC(model1)
## [1] 5491.272
AIC(model2)
## [1] 5489.295
AIC(model3)
## [1] 5521.887
BIC(model1)
## [1] 5504.466
BIC(model2)
## [1] 5499.19
BIC(model3)
## [1] 5531.782
#visualizing revenue vs. employees
plot(df$Employees, df$Revenue)
Employees and Revenue seem highly correlated and are causing multi-collinearity when used to predict the ARR. Employees as such is not a very statistically significant predictor variable by itself, where as Revenue is. Also, model2 is the best model in terms of Adjusted R-squared, AIC, and BIC compared to both model1 and model3. Hence, model2 is the one I am choosing to provide a conclusion to this analysis.