# EXERCISE 5

## Importing data from an Excel file

install.packages("readxl")
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.2'
## (as 'lib' is unspecified)
library(readxl)
data_r <- read_excel("Data.xlsx", col_types = c("numeric", 
    "numeric", "numeric", "numeric", "numeric", 
    "numeric", "numeric", "numeric", "numeric", 
    "numeric", "numeric", "text", "numeric", 
    "numeric"))
## Warning: Expecting numeric in A25 / R25C1: got 'A'
#View(data_r)

# 1. Checking the data

## 1.1 Getting to know the data

Y = NOTA_FINAL (continuous quantitative).
X = MATEMATICA (continuous quantitative), COMUNICACIÓN (continuous quantitative), INGLÉS (continuous quantitative).
  sapply(data_r, class)
##                         MATEMATICA                       COMUNICACIÓN 
##                          "numeric"                          "numeric" 
##                             INGLÉS                               ARTE 
##                          "numeric"                          "numeric" 
##        HISTORIA_GEOGRAFÌA_ECONOMÌA         FORMACIÒN_CIUDADANA_CÌVICA 
##                          "numeric"                          "numeric" 
## PERSONA_FAMILIA_RELACIONES_HUMANAS                   EDUCACIÓN_FÍSICA 
##                          "numeric"                          "numeric" 
##                EDUCACIÓN_RELIGIOSA        CIENCIA_TECNOLOGÍA_AMBIENTE 
##                          "numeric"                          "numeric" 
##          EDUCACIÓN_PARA_EL_TRABAJO                     COMPORTAMIENTO 
##                          "numeric"                        "character" 
##                 AREÁS_DESAPROBADAS                         NOTA_FINAL 
##                          "numeric"                          "numeric"
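
# A quick missing-value check may help here: the model summaries below report "1 observation deleted due to missingness", so it is useful to locate the incomplete row up front. A minimal base R sketch:

  # Count missing values per column
  colSums(is.na(data_r))
  # Inspect the incomplete row(s) that lm() will later drop
  data_r[!complete.cases(data_r), ]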

# 2. Fitting the model

# To run the linear regression we need two functions: lm(), which fits the linear model, and summary(), which displays the results of the fitted model.

# First we look at the estimated coefficients of the fitted model.

  # First model: all remaining variables as predictors
  regresion1 <- lm(NOTA_FINAL ~ ., data = data_r)
  summary(regresion1)
## Warning in summary.lm(regresion1): essentially perfect fit: summary may be
## unreliable
## 
## Call:
## lm(formula = NOTA_FINAL ~ ., data = data_r)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -8.564e-15 -4.068e-16  1.330e-17  6.117e-16  1.164e-14 
## 
## Coefficients:
##                                      Estimate Std. Error    t value Pr(>|t|)
## (Intercept)                         9.105e-15  7.763e-16  1.173e+01   <2e-16
## MATEMATICA                          9.091e-02  2.107e-16  4.315e+14   <2e-16
## COMUNICACIÓN                        9.091e-02  2.232e-16  4.073e+14   <2e-16
## INGLÉS                              9.091e-02  2.300e-16  3.952e+14   <2e-16
## ARTE                                9.091e-02  1.501e-16  6.056e+14   <2e-16
## HISTORIA_GEOGRAFÌA_ECONOMÌA         9.091e-02  2.290e-16  3.970e+14   <2e-16
## FORMACIÒN_CIUDADANA_CÌVICA          9.091e-02  2.272e-16  4.002e+14   <2e-16
## PERSONA_FAMILIA_RELACIONES_HUMANAS  9.091e-02  1.691e-16  5.377e+14   <2e-16
## EDUCACIÓN_FÍSICA                    9.091e-02  2.844e-16  3.196e+14   <2e-16
## EDUCACIÓN_RELIGIOSA                 9.091e-02  2.389e-16  3.806e+14   <2e-16
## CIENCIA_TECNOLOGÍA_AMBIENTE         9.091e-02  1.673e-16  5.434e+14   <2e-16
## EDUCACIÓN_PARA_EL_TRABAJO           9.091e-02  1.087e-16  8.361e+14   <2e-16
## COMPORTAMIENTOA                    -6.592e-15  4.345e-15 -1.517e+00    0.132
## AREÁS_DESAPROBADAS                  2.149e-16  5.664e-16  3.790e-01    0.705
##                                       
## (Intercept)                        ***
## MATEMATICA                         ***
## COMUNICACIÓN                       ***
## INGLÉS                             ***
## ARTE                               ***
## HISTORIA_GEOGRAFÌA_ECONOMÌA        ***
## FORMACIÒN_CIUDADANA_CÌVICA         ***
## PERSONA_FAMILIA_RELACIONES_HUMANAS ***
## EDUCACIÓN_FÍSICA                   ***
## EDUCACIÓN_RELIGIOSA                ***
## CIENCIA_TECNOLOGÍA_AMBIENTE        ***
## EDUCACIÓN_PARA_EL_TRABAJO          ***
## COMPORTAMIENTOA                       
## AREÁS_DESAPROBADAS                    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.736e-15 on 97 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:      1,  Adjusted R-squared:      1 
## F-statistic: 2.86e+31 on 13 and 97 DF,  p-value: < 2.2e-16
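
# Note: the "essentially perfect fit" warning and the identical coefficients of about 0.0909 ≈ 1/11 suggest that NOTA_FINAL is simply the average of the 11 course grades, so the full model reproduces the response exactly rather than estimating a statistical relationship. A quick check (assuming the course grades occupy columns 1 to 11 of data_r):

  # Compare NOTA_FINAL with the row mean of the 11 course-grade columns
  # (assumption: those grades are columns 1 to 11)
  promedio <- rowMeans(data_r[, 1:11])
  summary(promedio - data_r$NOTA_FINAL)  # differences near zero support the "average" hypothesis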

## Removing variables that do not contribute to the model

# The variables most directly related to the final grade that we keep for the prediction are MATEMATICA, COMUNICACIÓN and INGLÉS, so the final prediction model is specified as follows.

  # Reduced model with the three predictors kept for prediction
  regresion2 <- lm(NOTA_FINAL ~ MATEMATICA + COMUNICACIÓN + INGLÉS, data = data_r)
  summary(regresion2)
## 
## Call:
## lm(formula = NOTA_FINAL ~ MATEMATICA + COMUNICACIÓN + INGLÉS, 
##     data = data_r)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.16414 -0.28583  0.00729  0.34307  1.05400 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   0.32910    0.18273   1.801   0.0745 .  
## MATEMATICA    0.31311    0.04635   6.755 7.69e-10 ***
## COMUNICACIÓN  0.34174    0.04259   8.025 1.40e-12 ***
## INGLÉS        0.38870    0.04002   9.713 2.28e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4408 on 107 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.9814, Adjusted R-squared:  0.9809 
## F-statistic:  1886 on 3 and 107 DF,  p-value: < 2.2e-16
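
# As a complement to the t-tests in the summary, one can also report confidence intervals for the coefficients of the reduced model; a minimal sketch:

  # 95% confidence intervals for the coefficients of regresion2
  confint(regresion2, level = 0.95)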

# New model

# y = β0 + β1·x1 + β2·x2 + β3·x3 + ε


# 3. Model diagnostics / assumptions

## 3.1 Durbin-Watson test

  install.packages("lmtest")
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.2'
## (as 'lib' is unspecified)
  library(lmtest) 
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
  dwtest(regresion2)
## 
##  Durbin-Watson test
## 
## data:  regresion2
## DW = 1.8032, p-value = 0.1354
## alternative hypothesis: true autocorrelation is greater than 0

# DW = 1.80: since the Durbin-Watson statistic is close to 2 and the p-value (0.1354) is above 0.05, we conclude that the residuals show no autocorrelation, so we can assume the residuals are independent.
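
# As a complementary check of residual independence, one could also look at the autocorrelation function of the residuals or run a Breusch-Godfrey test (also provided by lmtest); a sketch:

  # Autocorrelation function of the residuals
  acf(residuals(regresion2), main = "ACF of residuals")
  # Breusch-Godfrey test for serial correlation up to order 2
  bgtest(regresion2, order = 2)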

## 3.2 Multivariate normality

  # Boxplots of the first five numeric variables
  par(mfrow = c(1, 5))
  for (i in 1:5) {
    boxplot(data_r[i], main = names(data_r)[i])
  }

Correlation analysis between two variables

  plot(data_r$NOTA_FINAL, data_r$MATEMATICA)

  plot(data_r$NOTA_FINAL, data_r$COMUNICACIÓN)

install.packages('ggplot2')
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.2'
## (as 'lib' is unspecified)
library(ggplot2)

ggplot(data_r, aes(MATEMATICA, NOTA_FINAL)) + 
  geom_point() +
  stat_smooth(method = "lm", col = "dodgerblue3") +
  labs(title = "Simple linear regression",
    x = "X: independent variable (MATEMATICA)",
    y = "Y: dependent variable (NOTA_FINAL)",
    caption = "Source: own elaboration") +
  theme_gray()
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 1 rows containing non-finite values (stat_smooth).
## Warning: Removed 1 rows containing missing values (geom_point).
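
# Besides the scatter plot, the standard diagnostic plots of the fitted lm object (residuals vs fitted, normal Q-Q, scale-location, residuals vs leverage) can be drawn directly; a minimal sketch:

  # Standard diagnostic plots for the reduced model
  par(mfrow = c(2, 2))
  plot(regresion2)
  par(mfrow = c(1, 1))  # restore the default layout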

# Correlation of multiple variables

install.packages('GGally')
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.2'
## (as 'lib' is unspecified)
library(GGally)
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
ggpairs(data_r)
## Warning: Removed 1 rows containing non-finite values (stat_density).
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removing 1 row that contained a missing value
## Warning: Removed 1 rows containing non-finite values (stat_boxplot).
## Warning: Removed 1 rows containing missing values (geom_point).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 1 rows containing non-finite values (stat_bin).
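
# Because data_r has 14 columns, the full ggpairs matrix is cluttered and triggers many missing-value warnings. A sketch restricted to the variables of the reduced model, with the incomplete row dropped, is easier to read:

ggpairs(na.omit(data_r[, c("MATEMATICA", "COMUNICACIÓN", "INGLÉS", "NOTA_FINAL")]))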

## 3.3 Shapiro-Wilk test

  # Residuals of the reduced model and Shapiro-Wilk normality test
  residua.1 <- residuals(regresion2)
  shapiro.test(residua.1)
## 
##  Shapiro-Wilk normality test
## 
## data:  residua.1
## W = 0.99032, p-value = 0.6195

# p-value = 0.6195: since p > 0.05 we do not reject normality, so the residuals can be considered normally distributed; if p < 0.05 we would conclude they are not normal.
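
# A normal Q-Q plot of the residuals gives a visual complement to the Shapiro-Wilk test; a minimal sketch:

  # Q-Q plot of the residuals of the reduced model
  qqnorm(residua.1)
  qqline(residua.1, col = "red")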

## 3.4 Multicollinearity (VIF)

  install.packages('car')
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.2'
## (as 'lib' is unspecified)
  library(car)
## Loading required package: carData
  vif(regresion2)
##   MATEMATICA COMUNICACIÓN       INGLÉS 
##    11.870417     8.548336     9.721875

# Since the VIF values exceed 5, we can conclude that the predictors are highly correlated with one another (multicollinearity), which could affect the final predictions, so a more thorough analysis would be necessary.
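
# To see where the multicollinearity comes from, one can inspect the pairwise correlations of the three predictors directly; a base R sketch using complete cases only:

  # Correlation matrix of the predictors in the reduced model
  cor(data_r[, c("MATEMATICA", "COMUNICACIÓN", "INGLÉS")], use = "complete.obs")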

# 4. Model prediction

# Displaying the model coefficients used for the prediction

  regresion2$coefficients
##  (Intercept)   MATEMATICA COMUNICACIÓN       INGLÉS 
##    0.3290967    0.3131074    0.3417402    0.3887007

# The fitted model is therefore: y = 0.3290967 + 0.3131074·x1 + 0.3417402·x2 + 0.3887007·x3

# To compute the prediction we need two functions:

# - The data.frame() function, to build a table with the values of the X variables.

# - The predict() function, to compute the prediction.

  # New observation for which we want to predict NOTA_FINAL
  data_prediccion <- data.frame(MATEMATICA = 8, COMUNICACIÓN = 10, INGLÉS = 20)
  predict(regresion2, data_prediccion)
##        1 
## 14.02537
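
# predict() can also return an interval around the point estimate; a sketch of a 95% prediction interval for the same new student:

  # 95% prediction interval for the new observation
  predict(regresion2, data_prediccion, interval = "prediction", level = 0.95)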