# ++EJERCICIO 5 ++

##Importando data desde un EXCEL

# Install readxl only when it is not already available, so that re-running
# the script does not download and reinstall the package every time.
if (!requireNamespace("readxl", quietly = TRUE)) {
  install.packages("readxl")
}
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.2'
## (as 'lib' is unspecified)
library(readxl)

# Read the workbook with explicit column types: 18 columns in total, all
# numeric except the 4th one, which is read as text.
data_r <- read_excel(
  "data_tratada.xlsx",
  col_types = c(rep("numeric", 3), "text", rep("numeric", 14))
)
#View(data_r)

#1. VERIFICANDO DATOS

##1.1 RECONOCIMIENTO DE DATOS

#Y = NOTA FINAL (Cuantitativa Continua).
#X = CANTIDAD DE FALTAS (Cuantitativa Discreta), CANTIDAD DE ASISTENCIAS (Cuantitativa Discreta), SEXO (Cualitativa Dicotómica).
  # Report the class of every column as a named character vector.
  # vapply() guarantees a character result; a column with several classes is
  # shown as "class1/class2" instead of silently turning the result into a list.
  vapply(
    data_r,
    function(columna) paste(class(columna), collapse = "/"),
    character(1)
  )
##                    edad    cantidad_asistencias         cantidad_faltas 
##               "numeric"               "numeric"               "numeric" 
##                    sexo              matematica            comunicación 
##             "character"               "numeric"               "numeric" 
##                  ingles                    arte                     HGE 
##               "numeric"               "numeric"               "numeric" 
##                     FCC                    PFRH                      EF 
##               "numeric"               "numeric"               "numeric" 
##                      ER                     CTA                     EPT 
##               "numeric"               "numeric"               "numeric" 
##                      CP cant_areas_desaprobadas              nota_final 
##               "numeric"               "numeric"               "numeric"

#+2. AJUSTES DEL MODELO

#Para realizar la regresión lineal necesitamos dos funciones: # –La función lm() nos permite realizar la regresión lineal # –La función summary() nos permite visualizar los resultados de un modelo

#Primero veremos los coeficientes para ajustar el modelo.

  # Full model: final grade explained by attendance count, absence count,
  # age and sex. summary() prints the coefficient table and fit statistics.
  regresion1 <- lm(
    nota_final ~ cantidad_asistencias + cantidad_faltas + edad + sexo,
    data = data_r
  )
  summary(regresion1)
## 
## Call:
## lm(formula = nota_final ~ cantidad_asistencias + cantidad_faltas + 
##     edad + sexo, data = data_r)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.18693 -0.07777  0.01449  0.11403  0.63398 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           9.945134   1.238444   8.030 6.47e-15 ***
## cantidad_asistencias  0.064269   0.008690   7.396 5.62e-13 ***
## cantidad_faltas      -0.064897   0.008651  -7.502 2.72e-13 ***
## edad                  0.009882   0.005612   1.761   0.0788 .  
## sexoM                 0.018975   0.019648   0.966   0.3346    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.221 on 523 degrees of freedom
##   (3 observations deleted due to missingness)
## Multiple R-squared:  0.9799, Adjusted R-squared:  0.9798 
## F-statistic:  6388 on 4 and 523 DF,  p-value: < 2.2e-16

##ELIMINANDO VARIABLES QUE NO APORTAN AL MODELO##

#La variable sexo no resulta estadísticamente significativa (p-value = 0.3346), por lo que se elimina del modelo; el modelo final para la predicción conserva cantidad_asistencias, cantidad_faltas y edad, y queda de la sgte manera.

  # Reduced model: same response, with the sex predictor dropped.
  regresion2 <- lm(
    nota_final ~ cantidad_asistencias + cantidad_faltas + edad,
    data = data_r
  )
  summary(regresion2)
## 
## Call:
## lm(formula = nota_final ~ cantidad_asistencias + cantidad_faltas + 
##     edad, data = data_r)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.19323 -0.08128  0.01825  0.11152  0.62612 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          10.006247   1.236747   8.091 4.15e-15 ***
## cantidad_asistencias  0.063892   0.008680   7.361 7.13e-13 ***
## cantidad_faltas      -0.065346   0.008638  -7.565 1.75e-13 ***
## edad                  0.010137   0.005605   1.808   0.0711 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.221 on 524 degrees of freedom
##   (3 observations deleted due to missingness)
## Multiple R-squared:  0.9799, Adjusted R-squared:  0.9798 
## F-statistic:  8518 on 3 and 524 DF,  p-value: < 2.2e-16

#NUEVO MODELO

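#+ $y = \beta_0 + \beta_1 x_1 + \beta_2 x_2 + \beta_3 x_3 + \varepsilon$ +#
#  (x_1 = cantidad_asistencias, x_2 = cantidad_faltas, x_3 = edad)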


#3. DIAGNÓSTICO DEL MODELO / SUPUESTOS

#3.1. Prueba de Durbin-Watson

  # Install lmtest only when it is not already available, so that re-running
  # the script does not download and reinstall the package every time.
  if (!requireNamespace("lmtest", quietly = TRUE)) {
    install.packages("lmtest")
  }
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.2'
## (as 'lib' is unspecified)
  library(lmtest) 
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
  # Durbin-Watson test on the reduced model. The alternative is spelled out:
  # "greater" (positive autocorrelation) is the default used by dwtest().
  dwtest(regresion2, alternative = "greater")
## 
##  Durbin-Watson test
## 
## data:  regresion2
## DW = 1.9522, p-value = 0.2768
## alternative hypothesis: true autocorrelation is greater than 0

#DW = 1.9522 (p-value = 0.2768): Analizando el resultado de Durbin-Watson (DW), se concluye que los residuos no presentan autocorrelación, ya que el estadístico está cerca de 2 y el p-value es mayor a 0,05, por lo que se puede asumir independencia entre los residuos del modelo.


## Análisis de correlación entre dos variables

  # Scatterplots of the final grade (x axis) against each predictor of the
  # reduced model (y axis): attendance count, absence count and age.
  plot(x = data_r$nota_final, y = data_r$cantidad_asistencias)

  plot(x = data_r$nota_final, y = data_r$cantidad_faltas)

  plot(x = data_r$nota_final, y = data_r$edad)

#Correlación de múltiples variables

# Install GGally only when it is not already available, so that re-running
# the script does not download and reinstall the package every time.
if (!requireNamespace("GGally", quietly = TRUE)) {
  install.packages("GGally")
}
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.2'
## (as 'lib' is unspecified)
library(GGally)
## Loading required package: ggplot2
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
# Pairwise plot matrix over every column of the data set.
ggpairs(data = data_r)
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 3 rows containing missing values

## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 3 rows containing missing values

## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 3 rows containing missing values
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 3 rows containing non-finite values (stat_boxplot).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 3 rows containing missing values
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 3 rows containing missing values
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 3 rows containing missing values
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 3 rows containing missing values
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 3 rows containing missing values
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 3 rows containing missing values
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 3 rows containing missing values
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 3 rows containing missing values
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 3 rows containing missing values
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 3 rows containing missing values
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 3 rows containing missing values
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 3 rows containing missing values
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 3 rows containing missing values
## Warning: Removed 3 rows containing missing values (geom_point).
## Removed 3 rows containing missing values (geom_point).
## Removed 3 rows containing missing values (geom_point).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 3 rows containing non-finite values (stat_bin).
## Removed 3 rows containing missing values (geom_point).
## Removed 3 rows containing missing values (geom_point).
## Removed 3 rows containing missing values (geom_point).
## Removed 3 rows containing missing values (geom_point).
## Removed 3 rows containing missing values (geom_point).
## Removed 3 rows containing missing values (geom_point).
## Removed 3 rows containing missing values (geom_point).
## Removed 3 rows containing missing values (geom_point).
## Removed 3 rows containing missing values (geom_point).
## Removed 3 rows containing missing values (geom_point).
## Removed 3 rows containing missing values (geom_point).
## Removed 3 rows containing missing values (geom_point).
## Removed 3 rows containing missing values (geom_point).
## Warning: Removed 3 rows containing non-finite values (stat_density).

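###3.2 Prueba de normalidad (Lilliefors): Si el Sig. > 0,05 los residuos presentan normalidad; si el Sig. < 0,05 los residuos no presentan normalidad. Como el p-value obtenido es < 2.2e-16, los residuos del modelo NO presentan normalidad.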

# Install nortest only when it is not already available, so that re-running
# the script does not download and reinstall the package every time.
if (!requireNamespace("nortest", quietly = TRUE)) {
  install.packages("nortest")
}
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.2'
## (as 'lib' is unspecified)
library(nortest)
# Extract the residuals of the reduced model and run the Lilliefors
# (Kolmogorov-Smirnov) normality test on them.
residua.1 <- resid(regresion2)
lillie.test(residua.1)
## 
##  Lilliefors (Kolmogorov-Smirnov) normality test
## 
## data:  residua.1
## D = 0.11845, p-value < 2.2e-16

#3.3 Multicolinealidad VIF

  # Install car only when it is not already available, so that re-running
  # the script does not download and reinstall the package every time.
  if (!requireNamespace("car", quietly = TRUE)) {
    install.packages("car")
  }
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.2'
## (as 'lib' is unspecified)
  library(car)
## Loading required package: carData
  # Variance inflation factor of each predictor in the reduced model.
  car::vif(regresion2)
## cantidad_asistencias      cantidad_faltas                 edad 
##            115.06163            115.14985              1.00824

#Como el valor del VIF excede a 5 (aprox. 115 para cantidad_asistencias y cantidad_faltas), se puede concluir que estas variables están altamente correlacionadas, lo que podría afectar el resultado final de la predicción, por lo que sería necesario hacer un análisis exhaustivo.

#+4.PREDICCIÓN DEL MODELO+

#Mostrando los coeficientes para la predicción

  # Named vector of the estimated coefficients of the reduced model.
  coef(regresion2)
##          (Intercept) cantidad_asistencias      cantidad_faltas 
##          10.00624677           0.06389193          -0.06534630 
##                 edad 
##           0.01013650

#VEMOS QUE EL MODELO QUEDARÍA DE LA SIGUIENTE MANERA:
## y = 10.00624677 + 0.06389193*x1 - 0.06534630*x2 + 0.01013650*x3

#Para realizar la predicción necesitamos dos funciones:

#- La función data.frame para generar una tabla de datos de la variable X.

#- La función predict para realizar la predicción.

  # One-row table with the predictor values to evaluate, followed by the
  # prediction of the reduced model for that row.
  data_prediccion <- data.frame(
    cantidad_asistencias = 130,
    cantidad_faltas = 10,
    edad = 14
  )
  predict(regresion2, newdata = data_prediccion)
##        1 
## 17.80065