Introducción

Se definirá un modelo de regresión lineal simple para predecir los salarios anuales de científicos de datos, medidos en USD (variable dependiente), con base en sus años de experiencia laboral (variable independiente).

Este documento es una adaptación de los materiales del curso que el instructor Mo Rebaie imparte en la plataforma Coursera, consultados en diciembre de 2020.

# Load the packages used by the analysis.
# The workspace is intentionally NOT cleared with rm(list = ls()): that call
# deletes the user's own objects when the code is run interactively, and it
# does not reset loaded packages or options, so it gives no real isolation.
library(caTools)       # sample.split() for the train/test partition
library(scales)
library(ggplot2)       # plotting
library(dplyr)
library(equatiomatic)  # extract_eq() to print the model equation
library(prettydoc)
# Import the dataset and inspect its structure
dataset <- read.csv("Predicting_Salaries.csv")
str(dataset)
## 'data.frame':    100 obs. of  2 variables:
##  $ YearsOfExperience: num  0.1 0.1 0.3 0.4 0.5 0.5 0.5 0.7 0.9 1 ...
##  $ AnnualSalary     : int  30100 32209 31880 33342 32308 33417 34907 36765 37311 37006 ...
# The rename below is positional, so first confirm the file has exactly the
# expected columns in the expected order; otherwise the labels would be
# silently attached to the wrong variables.
stopifnot(identical(names(dataset), c("YearsOfExperience", "AnnualSalary")))
names(dataset) <- c("experience", "salary")
summary(dataset)
##    experience         salary      
##  Min.   : 0.100   Min.   : 30100  
##  1st Qu.: 2.625   1st Qu.: 48337  
##  Median : 5.800   Median : 81008  
##  Mean   : 5.775   Mean   : 80519  
##  3rd Qu.: 8.850   3rd Qu.:109815  
##  Max.   :12.000   Max.   :141230
# Count missing values per column. vapply() pins the result to one integer
# per column, whereas sapply()'s return type depends on its input.
vapply(dataset, function(x) sum(is.na(x)), integer(1))
## experience     salary 
##          0          0
# Histogram of annual salaries in bins of 5,000 USD.
# The visible x-range is set with coord_cartesian() instead of scale limits:
# scale limits discard any bar whose edges fall outside the range (which
# silently dropped the two edge bars and raised a "Removed 2 rows" warning),
# whereas coord_cartesian() only zooms the view and keeps every bar.
ggplot(data = dataset, mapping = aes(x = salary)) +
  geom_histogram(fill = "deepskyblue2", binwidth = 5000) +
  ggtitle("Salarios anuales de científicos de datos (usd)") +
  xlab("USD anuales") +
  coord_cartesian(xlim = c(30000, 145000)) +
  theme_minimal()
Gráfico 1

Gráfico 1

En el histograma claramente observamos dos distribuciones separadas que medianamente se aproximan a la forma de campana.

# Partition the data: 3/4 of the rows for training, the rest for testing.
# The seed is fixed so the random partition is reproducible.
set.seed(90909)
split <- sample.split(dataset$salary, SplitRatio = 3 / 4)
training_set <- subset(dataset, split)
testing_set <- subset(dataset, !split)
# Fit a simple linear regression of salary on experience, using only the
# training rows, and print the model summary
linearregressor <- lm(formula = salary ~ experience, data = training_set)
summary(linearregressor)
## 
## Call:
## lm(formula = salary ~ experience, data = training_set)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -9884.7 -4015.7   708.8  2670.4 10731.0 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    26446       1076   24.59   <2e-16 ***
## experience      9346        151   61.90   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4677 on 73 degrees of freedom
## Multiple R-squared:  0.9813, Adjusted R-squared:  0.981 
## F-statistic:  3831 on 1 and 73 DF,  p-value: < 2.2e-16
# Print the model equation in symbolic (LaTeX) form
extract_eq(linearregressor)

\[ \operatorname{salary} = \alpha + \beta_{1}(\operatorname{experience}) + \epsilon \]

# Print the same equation with the estimated coefficients substituted in
extract_eq(linearregressor, use_coefs = TRUE)

\[ \operatorname{salary} = 26446.02 + 9345.73(\operatorname{experience}) + \epsilon \]

El modelo indica que, por cada año adicional de experiencia laboral, el salario anual de los científicos de datos se incrementa en promedio 9,345.73 USD, lo que equivale aproximadamente a 186,900 MXN (cifra que supone un tipo de cambio cercano a 20 pesos por dólar).

# Predict salaries for the held-out test rows and summarise the predictions
Y_pred <- predict(linearregressor, newdata = testing_set)
summary(Y_pred)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   27381   39530   64764   69623   98408  132053

Graficando el modelo

# Visualise the training set: observed salaries (points) and the fitted
# regression line.
# Columns are referenced by bare name through `data =` rather than with
# `training_set$...` inside aes(), which ggplot2 discourages.
ggplot(data = training_set, mapping = aes(x = experience)) +
  geom_point(aes(y = salary), colour = "darkgreen") +
  geom_line(
    aes(y = predict(linearregressor, newdata = training_set)),
    colour = "navy"
  ) +
  ggtitle("Annual Salaries of Data Scientists vs Experience in Years (Training Set)") +
  xlab("Years of Experience") +
  ylab("Annual Salary") +
  scale_x_continuous(limits = c(0, 12)) +
  scale_y_continuous(limits = c(0, 150000)) +
  theme_minimal()
Gráfico 2

Gráfico 2

# Visualise the test set: observed test salaries (points) against the
# regression line.
# The line is deliberately built from the training set: the model was fitted
# on those rows, so this is the same line shown in the training plot.
# Columns are referenced by bare name through `data =` rather than with
# `data$column` inside aes(), which ggplot2 discourages.
ggplot() +
  geom_point(
    data = testing_set,
    mapping = aes(x = experience, y = salary),
    colour = "blueviolet"
  ) +
  geom_line(
    data = training_set,
    mapping = aes(
      x = experience,
      y = predict(linearregressor, newdata = training_set)
    ),
    colour = "navy"
  ) +
  ggtitle("Annual Salaries of Data Scientists vs Experience in Years (Test Set)") +
  xlab("Years of Experience") +
  ylab("Annual Salary") +
  scale_x_continuous(limits = c(0, 12)) +
  scale_y_continuous(limits = c(0, 150000)) +
  theme_minimal()
Gráfico 3

Gráfico 3