##Importando data desde un EXCEL
# Load the grade sheet from Excel. readxl is installed only when it is
# missing, so re-running the script does not reinstall it every time.
if (!requireNamespace("readxl", quietly = TRUE)) {
  install.packages("readxl")
}
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.2'
## (as 'lib' is unspecified)
library(readxl)
# The sheet is read as 14 columns: column 12 as text, every other column
# forced to numeric (a non-numeric cell in those columns triggers a readxl
# warning, as recorded in the output below).
data_r <- read_excel(
  "Data.xlsx",
  col_types = c(rep("numeric", 11), "text", rep("numeric", 2))
)
## Warning: Expecting numeric in A25 / R25C1: got 'A'
#View(data_r)
##1.1 Reconociendo los datos
# Report the class of every column of data_r as a named character vector.
# vapply() pins the result type (one string per column); a column carrying
# several classes is shown joined with "/" instead of changing the result
# shape the way sapply() would.
vapply(
  data_r,
  function(column) paste(class(column), collapse = "/"),
  character(1)
)
## MATEMATICA COMUNICACIÓN
## "numeric" "numeric"
## INGLÉS ARTE
## "numeric" "numeric"
## HISTORIA_GEOGRAFÌA_ECONOMÌA FORMACIÒN_CIUDADANA_CÌVICA
## "numeric" "numeric"
## PERSONA_FAMILIA_RELACIONES_HUMANAS EDUCACIÓN_FÍSICA
## "numeric" "numeric"
## EDUCACIÓN_RELIGIOSA CIENCIA_TECNOLOGÍA_AMBIENTE
## "numeric" "numeric"
## EDUCACIÓN_PARA_EL_TRABAJO COMPORTAMIENTO
## "numeric" "character"
## AREÁS_DESAPROBADAS NOTA_FINAL
## "numeric" "numeric"
# Para realizar la regresión lineal necesitamos dos funciones:
# - La función lm() nos permite ajustar la regresión lineal.
# - La función summary() nos permite visualizar los resultados de un modelo.
#Primero veremos los coeficientes para ajustar el modelo.
# Full model: regress NOTA_FINAL on every other column of data_r, then print
# the fit summary.
# NOTE(review): the recorded output below reports an essentially perfect fit
# with every subject coefficient at about 1/11, which suggests NOTA_FINAL is
# the plain average of the eleven subject grades -- confirm against the data.
regresion1 <- lm(
  formula = NOTA_FINAL ~ .,
  data = data_r
)
summary(regresion1)
## Warning in summary.lm(regresion1): essentially perfect fit: summary may be
## unreliable
##
## Call:
## lm(formula = NOTA_FINAL ~ ., data = data_r)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8.564e-15 -4.068e-16 1.330e-17 6.117e-16 1.164e-14
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.105e-15 7.763e-16 1.173e+01 <2e-16
## MATEMATICA 9.091e-02 2.107e-16 4.315e+14 <2e-16
## COMUNICACIÓN 9.091e-02 2.232e-16 4.073e+14 <2e-16
## INGLÉS 9.091e-02 2.300e-16 3.952e+14 <2e-16
## ARTE 9.091e-02 1.501e-16 6.056e+14 <2e-16
## HISTORIA_GEOGRAFÌA_ECONOMÌA 9.091e-02 2.290e-16 3.970e+14 <2e-16
## FORMACIÒN_CIUDADANA_CÌVICA 9.091e-02 2.272e-16 4.002e+14 <2e-16
## PERSONA_FAMILIA_RELACIONES_HUMANAS 9.091e-02 1.691e-16 5.377e+14 <2e-16
## EDUCACIÓN_FÍSICA 9.091e-02 2.844e-16 3.196e+14 <2e-16
## EDUCACIÓN_RELIGIOSA 9.091e-02 2.389e-16 3.806e+14 <2e-16
## CIENCIA_TECNOLOGÍA_AMBIENTE 9.091e-02 1.673e-16 5.434e+14 <2e-16
## EDUCACIÓN_PARA_EL_TRABAJO 9.091e-02 1.087e-16 8.361e+14 <2e-16
## COMPORTAMIENTOA -6.592e-15 4.345e-15 -1.517e+00 0.132
## AREÁS_DESAPROBADAS 2.149e-16 5.664e-16 3.790e-01 0.705
##
## (Intercept) ***
## MATEMATICA ***
## COMUNICACIÓN ***
## INGLÉS ***
## ARTE ***
## HISTORIA_GEOGRAFÌA_ECONOMÌA ***
## FORMACIÒN_CIUDADANA_CÌVICA ***
## PERSONA_FAMILIA_RELACIONES_HUMANAS ***
## EDUCACIÓN_FÍSICA ***
## EDUCACIÓN_RELIGIOSA ***
## CIENCIA_TECNOLOGÍA_AMBIENTE ***
## EDUCACIÓN_PARA_EL_TRABAJO ***
## COMPORTAMIENTOA
## AREÁS_DESAPROBADAS
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.736e-15 on 97 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 1, Adjusted R-squared: 1
## F-statistic: 2.86e+31 on 13 and 97 DF, p-value: < 2.2e-16
##ELIMINANDO VARIABLES QUE NO APORTAN AL MODELO##
# Se conservan como predictores de la nota final las notas de MATEMATICA, COMUNICACIÓN e INGLÉS, por lo que el modelo final para la predicción queda de la siguiente manera.
# Reduced model: NOTA_FINAL explained by three subject grades only
# (MATEMATICA, COMUNICACIÓN, INGLÉS), followed by its fit summary.
regresion2 <- lm(
  formula = NOTA_FINAL ~ MATEMATICA + COMUNICACIÓN + INGLÉS,
  data = data_r
)
summary(regresion2)
##
## Call:
## lm(formula = NOTA_FINAL ~ MATEMATICA + COMUNICACIÓN + INGLÉS,
## data = data_r)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.16414 -0.28583 0.00729 0.34307 1.05400
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.32910 0.18273 1.801 0.0745 .
## MATEMATICA 0.31311 0.04635 6.755 7.69e-10 ***
## COMUNICACIÓN 0.34174 0.04259 8.025 1.40e-12 ***
## INGLÉS 0.38870 0.04002 9.713 2.28e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4408 on 107 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 0.9814, Adjusted R-squared: 0.9809
## F-statistic: 1886 on 3 and 107 DF, p-value: < 2.2e-16
#NUEVO MODELO
# $y = \beta_0 + \beta_1 x_1 + \beta_2 x_2 + \beta_3 x_3 + \varepsilon$  (x_1 = MATEMATICA, x_2 = COMUNICACIÓN, x_3 = INGLÉS)
#3.1. Prueba de Durbin-Watson
# Durbin-Watson test on regresion2 (autocorrelation of its residuals).
# lmtest is installed only when it is not already available, so the script
# can be re-run without reinstalling the package.
if (!requireNamespace("lmtest", quietly = TRUE)) {
  install.packages("lmtest")
}
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.2'
## (as 'lib' is unspecified)
library(lmtest)
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
dwtest(regresion2)
##
## Durbin-Watson test
##
## data: regresion2
## DW = 1.8032, p-value = 0.1354
## alternative hypothesis: true autocorrelation is greater than 0
# DW = 1.8: Analizando el resultado de Durbin-Watson (DW), se concluye que los residuos del modelo no presentan autocorrelación, ya que el estadístico está cerca de 2 (p-value = 0.1354 > 0.05), por lo que se puede asumir independencia entre los residuos.
#3.2 Normalidad Multivariada
# Box plots of the first five columns of data_r, side by side on one row.
# par() returns the previous settings, which are kept so they can be restored.
old_par <- par(mfrow = c(1, 5))
for (i in seq_len(5)) {
  boxplot(data_r[i], main = names(data_r)[i])
}
# Restore the previous layout so the scatter plots below are drawn full-size
# instead of being squeezed into the 1 x 5 grid.
par(old_par)
# Scatter plots of two subject grades against the final grade.
plot(data_r$NOTA_FINAL, data_r$MATEMATICA)
plot(data_r$NOTA_FINAL, data_r$COMUNICACIÓN)
# Scatter plot of NOTA_FINAL against MATEMATICA with a fitted linear trend.
# ggplot2 is installed only when it is not already available.
if (!requireNamespace("ggplot2", quietly = TRUE)) {
  install.packages("ggplot2")
}
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.2'
## (as 'lib' is unspecified)
library(ggplot2)
ggplot(data_r, aes(x = MATEMATICA, y = NOTA_FINAL)) +
  geom_point() +
  # formula is spelled out (it is the one ggplot2 reports using in the
  # recorded output) so the smoother no longer has to announce its default.
  stat_smooth(method = "lm", formula = y ~ x, colour = "dodgerblue3") +
  labs(
    title = "Regresion lineal simple",
    x = "X : variable independiente",
    y = "Y : variable dependiente",
    caption = "Fuente: elaboracion propia"
  ) +
  theme_gray()
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 1 rows containing non-finite values (stat_smooth).
## Warning: Removed 1 rows containing missing values (geom_point).
# Correlación de múltiples variables
# Pairwise plot matrix of every column of data_r.
# GGally is installed only when it is not already available.
if (!requireNamespace("GGally", quietly = TRUE)) {
  install.packages("GGally")
}
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.2'
## (as 'lib' is unspecified)
library(GGally)
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
ggpairs(data_r)
## Warning: Removed 1 rows containing non-finite values (stat_density).
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removing 1 row that contained a missing value
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removing 1 row that contained a missing value
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removing 1 row that contained a missing value
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removing 1 row that contained a missing value
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removing 1 row that contained a missing value
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removing 1 row that contained a missing value
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removing 1 row that contained a missing value
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removing 1 row that contained a missing value
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removing 1 row that contained a missing value
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removing 1 row that contained a missing value
## Warning: Removed 1 rows containing non-finite values (stat_boxplot).
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removing 1 row that contained a missing value
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removing 1 row that contained a missing value
## Warning: Removed 1 rows containing missing values (geom_point).
## Removed 1 rows containing missing values (geom_point).
## Removed 1 rows containing missing values (geom_point).
## Removed 1 rows containing missing values (geom_point).
## Removed 1 rows containing missing values (geom_point).
## Removed 1 rows containing missing values (geom_point).
## Removed 1 rows containing missing values (geom_point).
## Removed 1 rows containing missing values (geom_point).
## Removed 1 rows containing missing values (geom_point).
## Removed 1 rows containing missing values (geom_point).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 1 rows containing non-finite values (stat_bin).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 1 rows containing missing values (geom_point).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 1 rows containing missing values (geom_point).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
#3.2 Prueba de Shapiro Test
# Shapiro-Wilk normality test applied to the residuals of regresion2.
residua.1 <- resid(regresion2)
shapiro.test(residua.1)
##
## Shapiro-Wilk normality test
##
## data: residua.1
## W = 0.99032, p-value = 0.6195
### p-value = 0.6195: si el p-value > 0.05, los residuos presentan normalidad; si el p-value < 0.05, los residuos no presentan normalidad. En este caso no se rechaza la normalidad.
#3.3 Multicolinealidad VIF
# Variance inflation factors for the predictors of regresion2.
# car is installed only when it is not already available.
if (!requireNamespace("car", quietly = TRUE)) {
  install.packages("car")
}
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.2'
## (as 'lib' is unspecified)
library(car)
## Loading required package: carData
vif(regresion2)
## MATEMATICA COMUNICACIÓN INGLÉS
## 11.870417 8.548336 9.721875
# Como el valor del VIF excede a 5, se puede concluir que las variables están altamente correlacionadas, lo que podría afectar el resultado final de la predicción, por lo que sería necesario hacer un análisis exhaustivo.
#Mostrando el coeficiente para la predicciòn
# Print the fitted coefficients of regresion2 (intercept and the three
# slopes) through the standard accessor.
coef(regresion2)
## (Intercept) MATEMATICA COMUNICACIÓN INGLÉS
## 0.3290967 0.3131074 0.3417402 0.3887007
# Vemos que el modelo quedaría de la siguiente manera: y = 0.3290967 + 0.3131074 x1 + 0.3417402 x2 + 0.3887007 x3
# Prediction needs two functions: data.frame() builds the table of predictor
# values and predict() evaluates the fitted model on it.
# In the original, the wrapped comment text was left uncommented and the
# assignment below was fused into it, so the script could not be parsed and
# data_prediccion was never created.
data_prediccion <- data.frame(MATEMATICA = 8, COMUNICACIÓN = 10, INGLÉS = 20)
predict(regresion2, newdata = data_prediccion)
## 1
## 14.02537