##Importando data desde un EXCEL
# Install readxl only when it is missing: an unconditional install.packages()
# re-downloads and reinstalls the package every time the script is run.
if (!requireNamespace("readxl", quietly = TRUE)) {
  install.packages("readxl")
}
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.2'
## (as 'lib' is unspecified)
library(readxl)

# Read the spreadsheet with explicit column types: 18 columns, all numeric
# except the 4th one, which is read as text.
data_r <- read_excel(
  "data_tratada.xlsx",
  col_types = c(rep("numeric", 3), "text", rep("numeric", 14))
)
# View(data_r)
##1.1 RECONOCIMIENTO DE DATOS
# Report the class of every column as a named character vector.
# vapply() guarantees one string per column; a column carrying several
# classes is shown as "class1/class2" instead of turning the result into a list.
vapply(
  data_r,
  function(columna) paste(class(columna), collapse = "/"),
  character(1)
)
## edad cantidad_asistencias cantidad_faltas
## "numeric" "numeric" "numeric"
## sexo matematica comunicación
## "character" "numeric" "numeric"
## ingles arte HGE
## "numeric" "numeric" "numeric"
## FCC PFRH EF
## "numeric" "numeric" "numeric"
## ER CTA EPT
## "numeric" "numeric" "numeric"
## CP cant_areas_desaprobadas nota_final
## "numeric" "numeric" "numeric"
#Para realizar la regresión lineal necesitamos dos funciones:
# - La función lm() nos permite realizar la regresión lineal.
# - La función summary() nos permite visualizar los resultados de un modelo.
# Fit the full model first and inspect its coefficient table.
regresion1 <- lm(
  formula = nota_final ~ cantidad_asistencias + cantidad_faltas + edad + sexo,
  data = data_r
)
summary(regresion1)
##
## Call:
## lm(formula = nota_final ~ cantidad_asistencias + cantidad_faltas +
## edad + sexo, data = data_r)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.18693 -0.07777 0.01449 0.11403 0.63398
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.945134 1.238444 8.030 6.47e-15 ***
## cantidad_asistencias 0.064269 0.008690 7.396 5.62e-13 ***
## cantidad_faltas -0.064897 0.008651 -7.502 2.72e-13 ***
## edad 0.009882 0.005612 1.761 0.0788 .
## sexoM 0.018975 0.019648 0.966 0.3346
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.221 on 523 degrees of freedom
## (3 observations deleted due to missingness)
## Multiple R-squared: 0.9799, Adjusted R-squared: 0.9798
## F-statistic: 6388 on 4 and 523 DF, p-value: < 2.2e-16
##ELIMINANDO VARIABLES QUE NO APORTAN AL MODELO##
#La variable sexo es la que menos aporta al modelo (p = 0.3346 > 0.05), por lo que se elimina; el modelo final para la predicción conserva cantidad_asistencias, cantidad_faltas y edad, y queda de la sgte manera.
# Reduced model: nota_final explained by attendance, absences and age only.
regresion2 <- lm(
  formula = nota_final ~ cantidad_asistencias + cantidad_faltas + edad,
  data = data_r
)
summary(regresion2)
##
## Call:
## lm(formula = nota_final ~ cantidad_asistencias + cantidad_faltas +
## edad, data = data_r)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.19323 -0.08128 0.01825 0.11152 0.62612
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 10.006247 1.236747 8.091 4.15e-15 ***
## cantidad_asistencias 0.063892 0.008680 7.361 7.13e-13 ***
## cantidad_faltas -0.065346 0.008638 -7.565 1.75e-13 ***
## edad 0.010137 0.005605 1.808 0.0711 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.221 on 524 degrees of freedom
## (3 observations deleted due to missingness)
## Multiple R-squared: 0.9799, Adjusted R-squared: 0.9798
## F-statistic: 8518 on 3 and 524 DF, p-value: < 2.2e-16
#NUEVO MODELO
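#$y = \beta_0 + \beta_1 x_1 + \beta_2 x_2 + \beta_3 x_3 + \varepsilon$ (x_1 = cantidad_asistencias, x_2 = cantidad_faltas, x_3 = edad)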
#3.1. Prueba de Durbin-Watson
# Install lmtest only when it is missing, so re-running the script does not
# reinstall the package every time.
if (!requireNamespace("lmtest", quietly = TRUE)) {
  install.packages("lmtest")
}
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.2'
## (as 'lib' is unspecified)
library(lmtest)
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
# Durbin-Watson test for autocorrelation in the residuals of regresion2.
dwtest(regresion2)
##
## Durbin-Watson test
##
## data: regresion2
## DW = 1.9522, p-value = 0.2768
## alternative hypothesis: true autocorrelation is greater than 0
#DW = 1.9522 (p-value = 0.2768): Analizando el resultado de Durbin-Watson (DW), se concluye que los residuos no presentan autocorrelación, ya que el estadístico está cerca de 2 y el p-value es mayor que 0.05, por lo que se puede asumir independencia entre los residuos del modelo.
## Análisis de correlación entre dos variables
# Scatter plots of nota_final against each predictor used in the model.
plot(data_r$nota_final, data_r$cantidad_asistencias)
plot(data_r$nota_final, data_r$cantidad_faltas)
plot(data_r$nota_final, data_r$edad)
# Correlation across multiple variables: pairwise plot matrix of all columns.
# Install GGally only when it is missing, so re-running the script does not
# reinstall the package every time.
if (!requireNamespace("GGally", quietly = TRUE)) {
  install.packages("GGally")
}
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.2'
## (as 'lib' is unspecified)
library(GGally)
## Loading required package: ggplot2
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
ggpairs(data_r)
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 3 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 3 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 3 rows containing missing values
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 3 rows containing non-finite values (stat_boxplot).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 3 rows containing missing values
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 3 rows containing missing values
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 3 rows containing missing values
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 3 rows containing missing values
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 3 rows containing missing values
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 3 rows containing missing values
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 3 rows containing missing values
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 3 rows containing missing values
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 3 rows containing missing values
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 3 rows containing missing values
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 3 rows containing missing values
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 3 rows containing missing values
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 3 rows containing missing values
## Warning: Removed 3 rows containing missing values (geom_point).
## Removed 3 rows containing missing values (geom_point).
## Removed 3 rows containing missing values (geom_point).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 3 rows containing non-finite values (stat_bin).
## Removed 3 rows containing missing values (geom_point).
## Removed 3 rows containing missing values (geom_point).
## Removed 3 rows containing missing values (geom_point).
## Removed 3 rows containing missing values (geom_point).
## Removed 3 rows containing missing values (geom_point).
## Removed 3 rows containing missing values (geom_point).
## Removed 3 rows containing missing values (geom_point).
## Removed 3 rows containing missing values (geom_point).
## Removed 3 rows containing missing values (geom_point).
## Removed 3 rows containing missing values (geom_point).
## Removed 3 rows containing missing values (geom_point).
## Removed 3 rows containing missing values (geom_point).
## Removed 3 rows containing missing values (geom_point).
## Warning: Removed 3 rows containing non-finite values (stat_density).
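###Prueba de normalidad de los residuos (Lilliefors): si el p-value > 0.05, los residuos presentan normalidad; si el p-value < 0.05, los residuos no presentan normalidad. En este caso el p-value es < 2.2e-16, por lo que los residuos no presentan normalidad.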
# Install nortest only when it is missing, so re-running the script does not
# reinstall the package every time.
if (!requireNamespace("nortest", quietly = TRUE)) {
  install.packages("nortest")
}
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.2'
## (as 'lib' is unspecified)
library(nortest)
# Lilliefors (Kolmogorov-Smirnov) normality test on the residuals of
# regresion2. The variable name is kept because the test output echoes it.
residua.1 <- residuals(regresion2)
lillie.test(residua.1)
##
## Lilliefors (Kolmogorov-Smirnov) normality test
##
## data: residua.1
## D = 0.11845, p-value < 2.2e-16
#3.3 Multicolinealidad VIF
# Install car only when it is missing, so re-running the script does not
# reinstall the package every time.
if (!requireNamespace("car", quietly = TRUE)) {
  install.packages("car")
}
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.2'
## (as 'lib' is unspecified)
library(car)
## Loading required package: carData
# Variance inflation factor of each predictor in regresion2.
vif(regresion2)
## cantidad_asistencias cantidad_faltas edad
## 115.06163 115.14985 1.00824
#Como el valor del VIF excede a 5 para cantidad_asistencias y cantidad_faltas (≈115 en ambos casos), se puede concluir que estas variables están altamente correlacionadas, lo que podría afectar el resultado final de la predicción, por lo que sería necesario hacer un análisis exhaustivo.
# Show the fitted coefficients that define the prediction equation.
coef(regresion2)
## (Intercept) cantidad_asistencias cantidad_faltas
## 10.00624677 0.06389193 -0.06534630
## edad
## 0.01013650
#VEMOS QUE EL MODELO QUEDARÍA DE LA SIGUIENTE MANERA:
## y = 10.00624677 + 0.06389193 x1 - 0.06534630 x2 + 0.01013650 x3
## (x1 = cantidad_asistencias, x2 = cantidad_faltas, x3 = edad)
# Prediction needs two functions:
#   data.frame() builds the table of predictor (X) values;
#   predict() computes the model's prediction for those values.
data_prediccion <- data.frame(
  cantidad_asistencias = 130,
  cantidad_faltas = 10,
  edad = 14
)
predict(regresion2, newdata = data_prediccion)
## 1
## 17.80065