Regresión lineal con R

library(dplyr)
library(ggplot2)
library(ggthemes)
library(corrgram)
library(corrplot)
library(caTools)
# NOTE(review): the hard-coded setwd() ties this script to one machine. It is
# kept so that the relative path below (and anything else that relies on the
# working directory) still resolves.
setwd("~/R/R-Course-HTML-Notes/R-for-Data-Science-and-Machine-Learning/Machine Learning with R")
# The file is semicolon-separated. stringsAsFactors = TRUE is spelled out
# because the default became FALSE in R 4.0.0, while the rest of this analysis
# (per-level counts in summary(), "Factor w/ ..." in str()) relies on the text
# columns being factors.
df <- read.csv("student-mat.csv", sep = ";", stringsAsFactors = TRUE)
head(df)
##   school sex age address famsize Pstatus Medu Fedu     Mjob     Fjob
## 1     GP   F  18       U     GT3       A    4    4  at_home  teacher
## 2     GP   F  17       U     GT3       T    1    1  at_home    other
## 3     GP   F  15       U     LE3       T    1    1  at_home    other
## 4     GP   F  15       U     GT3       T    4    2   health services
## 5     GP   F  16       U     GT3       T    3    3    other    other
## 6     GP   M  16       U     LE3       T    4    3 services    other
##       reason guardian traveltime studytime failures schoolsup famsup paid
## 1     course   mother          2         2        0       yes     no   no
## 2     course   father          1         2        0        no    yes   no
## 3      other   mother          1         2        3       yes     no  yes
## 4       home   mother          1         3        0        no    yes  yes
## 5       home   father          1         2        0        no    yes  yes
## 6 reputation   mother          1         2        0        no    yes  yes
##   activities nursery higher internet romantic famrel freetime goout Dalc
## 1         no     yes    yes       no       no      4        3     4    1
## 2         no      no    yes      yes       no      5        3     3    1
## 3         no     yes    yes      yes       no      4        3     2    2
## 4        yes     yes    yes      yes      yes      3        2     2    1
## 5         no     yes    yes       no       no      4        3     2    1
## 6        yes     yes    yes      yes       no      5        4     2    1
##   Walc health absences G1 G2 G3
## 1    1      3        6  5  6  6
## 2    1      3        4  5  5  6
## 3    3      3       10  7  8 10
## 4    1      5        2 15 14 15
## 5    2      5        4  6 10 10
## 6    2      5       10 15 15 15
summary(df)
##  school   sex          age       address famsize   Pstatus      Medu      
##  GP:349   F:208   Min.   :15.0   R: 88   GT3:281   A: 41   Min.   :0.000  
##  MS: 46   M:187   1st Qu.:16.0   U:307   LE3:114   T:354   1st Qu.:2.000  
##                   Median :17.0                             Median :3.000  
##                   Mean   :16.7                             Mean   :2.749  
##                   3rd Qu.:18.0                             3rd Qu.:4.000  
##                   Max.   :22.0                             Max.   :4.000  
##       Fedu             Mjob           Fjob            reason   
##  Min.   :0.000   at_home : 59   at_home : 20   course    :145  
##  1st Qu.:2.000   health  : 34   health  : 18   home      :109  
##  Median :2.000   other   :141   other   :217   other     : 36  
##  Mean   :2.522   services:103   services:111   reputation:105  
##  3rd Qu.:3.000   teacher : 58   teacher : 29                   
##  Max.   :4.000                                                 
##    guardian     traveltime      studytime        failures      schoolsup
##  father: 90   Min.   :1.000   Min.   :1.000   Min.   :0.0000   no :344  
##  mother:273   1st Qu.:1.000   1st Qu.:1.000   1st Qu.:0.0000   yes: 51  
##  other : 32   Median :1.000   Median :2.000   Median :0.0000            
##               Mean   :1.448   Mean   :2.035   Mean   :0.3342            
##               3rd Qu.:2.000   3rd Qu.:2.000   3rd Qu.:0.0000            
##               Max.   :4.000   Max.   :4.000   Max.   :3.0000            
##  famsup     paid     activities nursery   higher    internet  romantic 
##  no :153   no :214   no :194    no : 81   no : 20   no : 66   no :263  
##  yes:242   yes:181   yes:201    yes:314   yes:375   yes:329   yes:132  
##                                                                        
##                                                                        
##                                                                        
##                                                                        
##      famrel         freetime         goout            Dalc      
##  Min.   :1.000   Min.   :1.000   Min.   :1.000   Min.   :1.000  
##  1st Qu.:4.000   1st Qu.:3.000   1st Qu.:2.000   1st Qu.:1.000  
##  Median :4.000   Median :3.000   Median :3.000   Median :1.000  
##  Mean   :3.944   Mean   :3.235   Mean   :3.109   Mean   :1.481  
##  3rd Qu.:5.000   3rd Qu.:4.000   3rd Qu.:4.000   3rd Qu.:2.000  
##  Max.   :5.000   Max.   :5.000   Max.   :5.000   Max.   :5.000  
##       Walc           health         absences            G1       
##  Min.   :1.000   Min.   :1.000   Min.   : 0.000   Min.   : 3.00  
##  1st Qu.:1.000   1st Qu.:3.000   1st Qu.: 0.000   1st Qu.: 8.00  
##  Median :2.000   Median :4.000   Median : 4.000   Median :11.00  
##  Mean   :2.291   Mean   :3.554   Mean   : 5.709   Mean   :10.91  
##  3rd Qu.:3.000   3rd Qu.:5.000   3rd Qu.: 8.000   3rd Qu.:13.00  
##  Max.   :5.000   Max.   :5.000   Max.   :75.000   Max.   :19.00  
##        G2              G3       
##  Min.   : 0.00   Min.   : 0.00  
##  1st Qu.: 9.00   1st Qu.: 8.00  
##  Median :11.00   Median :11.00  
##  Mean   :10.71   Mean   :10.42  
##  3rd Qu.:13.00   3rd Qu.:14.00  
##  Max.   :19.00   Max.   :20.00
str(df)
## 'data.frame':    395 obs. of  33 variables:
##  $ school    : Factor w/ 2 levels "GP","MS": 1 1 1 1 1 1 1 1 1 1 ...
##  $ sex       : Factor w/ 2 levels "F","M": 1 1 1 1 1 2 2 1 2 2 ...
##  $ age       : int  18 17 15 15 16 16 16 17 15 15 ...
##  $ address   : Factor w/ 2 levels "R","U": 2 2 2 2 2 2 2 2 2 2 ...
##  $ famsize   : Factor w/ 2 levels "GT3","LE3": 1 1 2 1 1 2 2 1 2 1 ...
##  $ Pstatus   : Factor w/ 2 levels "A","T": 1 2 2 2 2 2 2 1 1 2 ...
##  $ Medu      : int  4 1 1 4 3 4 2 4 3 3 ...
##  $ Fedu      : int  4 1 1 2 3 3 2 4 2 4 ...
##  $ Mjob      : Factor w/ 5 levels "at_home","health",..: 1 1 1 2 3 4 3 3 4 3 ...
##  $ Fjob      : Factor w/ 5 levels "at_home","health",..: 5 3 3 4 3 3 3 5 3 3 ...
##  $ reason    : Factor w/ 4 levels "course","home",..: 1 1 3 2 2 4 2 2 2 2 ...
##  $ guardian  : Factor w/ 3 levels "father","mother",..: 2 1 2 2 1 2 2 2 2 2 ...
##  $ traveltime: int  2 1 1 1 1 1 1 2 1 1 ...
##  $ studytime : int  2 2 2 3 2 2 2 2 2 2 ...
##  $ failures  : int  0 0 3 0 0 0 0 0 0 0 ...
##  $ schoolsup : Factor w/ 2 levels "no","yes": 2 1 2 1 1 1 1 2 1 1 ...
##  $ famsup    : Factor w/ 2 levels "no","yes": 1 2 1 2 2 2 1 2 2 2 ...
##  $ paid      : Factor w/ 2 levels "no","yes": 1 1 2 2 2 2 1 1 2 2 ...
##  $ activities: Factor w/ 2 levels "no","yes": 1 1 1 2 1 2 1 1 1 2 ...
##  $ nursery   : Factor w/ 2 levels "no","yes": 2 1 2 2 2 2 2 2 2 2 ...
##  $ higher    : Factor w/ 2 levels "no","yes": 2 2 2 2 2 2 2 2 2 2 ...
##  $ internet  : Factor w/ 2 levels "no","yes": 1 2 2 2 1 2 2 1 2 2 ...
##  $ romantic  : Factor w/ 2 levels "no","yes": 1 1 1 2 1 1 1 1 1 1 ...
##  $ famrel    : int  4 5 4 3 4 5 4 4 4 5 ...
##  $ freetime  : int  3 3 3 2 3 4 4 1 2 5 ...
##  $ goout     : int  4 3 2 2 2 2 4 4 2 1 ...
##  $ Dalc      : int  1 1 2 1 1 1 1 1 1 1 ...
##  $ Walc      : int  1 1 3 1 2 2 1 1 1 1 ...
##  $ health    : int  3 3 3 5 5 5 3 1 1 5 ...
##  $ absences  : int  6 4 10 2 4 10 0 6 0 0 ...
##  $ G1        : int  5 5 7 15 6 15 12 6 16 14 ...
##  $ G2        : int  6 5 8 14 10 15 12 5 18 15 ...
##  $ G3        : int  6 6 10 15 10 15 11 6 19 15 ...
# Medu and Fedu are read as integers; convert both columns to factors in one
# step so they are treated as categories, then re-inspect the structure.
df[c("Medu", "Fedu")] <- lapply(df[c("Medu", "Fedu")], as.factor)
str(df)
## 'data.frame':    395 obs. of  33 variables:
##  $ school    : Factor w/ 2 levels "GP","MS": 1 1 1 1 1 1 1 1 1 1 ...
##  $ sex       : Factor w/ 2 levels "F","M": 1 1 1 1 1 2 2 1 2 2 ...
##  $ age       : int  18 17 15 15 16 16 16 17 15 15 ...
##  $ address   : Factor w/ 2 levels "R","U": 2 2 2 2 2 2 2 2 2 2 ...
##  $ famsize   : Factor w/ 2 levels "GT3","LE3": 1 1 2 1 1 2 2 1 2 1 ...
##  $ Pstatus   : Factor w/ 2 levels "A","T": 1 2 2 2 2 2 2 1 1 2 ...
##  $ Medu      : Factor w/ 5 levels "0","1","2","3",..: 5 2 2 5 4 5 3 5 4 4 ...
##  $ Fedu      : Factor w/ 5 levels "0","1","2","3",..: 5 2 2 3 4 4 3 5 3 5 ...
##  $ Mjob      : Factor w/ 5 levels "at_home","health",..: 1 1 1 2 3 4 3 3 4 3 ...
##  $ Fjob      : Factor w/ 5 levels "at_home","health",..: 5 3 3 4 3 3 3 5 3 3 ...
##  $ reason    : Factor w/ 4 levels "course","home",..: 1 1 3 2 2 4 2 2 2 2 ...
##  $ guardian  : Factor w/ 3 levels "father","mother",..: 2 1 2 2 1 2 2 2 2 2 ...
##  $ traveltime: int  2 1 1 1 1 1 1 2 1 1 ...
##  $ studytime : int  2 2 2 3 2 2 2 2 2 2 ...
##  $ failures  : int  0 0 3 0 0 0 0 0 0 0 ...
##  $ schoolsup : Factor w/ 2 levels "no","yes": 2 1 2 1 1 1 1 2 1 1 ...
##  $ famsup    : Factor w/ 2 levels "no","yes": 1 2 1 2 2 2 1 2 2 2 ...
##  $ paid      : Factor w/ 2 levels "no","yes": 1 1 2 2 2 2 1 1 2 2 ...
##  $ activities: Factor w/ 2 levels "no","yes": 1 1 1 2 1 2 1 1 1 2 ...
##  $ nursery   : Factor w/ 2 levels "no","yes": 2 1 2 2 2 2 2 2 2 2 ...
##  $ higher    : Factor w/ 2 levels "no","yes": 2 2 2 2 2 2 2 2 2 2 ...
##  $ internet  : Factor w/ 2 levels "no","yes": 1 2 2 2 1 2 2 1 2 2 ...
##  $ romantic  : Factor w/ 2 levels "no","yes": 1 1 1 2 1 1 1 1 1 1 ...
##  $ famrel    : int  4 5 4 3 4 5 4 4 4 5 ...
##  $ freetime  : int  3 3 3 2 3 4 4 1 2 5 ...
##  $ goout     : int  4 3 2 2 2 2 4 4 2 1 ...
##  $ Dalc      : int  1 1 2 1 1 1 1 1 1 1 ...
##  $ Walc      : int  1 1 3 1 2 2 1 1 1 1 ...
##  $ health    : int  3 3 3 5 5 5 3 1 1 5 ...
##  $ absences  : int  6 4 10 2 4 10 0 6 0 0 ...
##  $ G1        : int  5 5 7 15 6 15 12 6 16 14 ...
##  $ G2        : int  6 5 8 14 10 15 12 5 18 15 ...
##  $ G3        : int  6 6 10 15 10 15 11 6 19 15 ...
# TRUE if any cell of the data frame is missing.
anyNA(df)
## [1] FALSE

Vamos a calcular la correlación entre las columnas numéricas del dataframe.

# Flag the numeric columns. vapply() guarantees a logical vector, whereas
# sapply() can return a list when there are no columns to inspect.
num.cols <- vapply(df, is.numeric, logical(1))
# drop = FALSE keeps a data frame even if only one column is numeric, so cor()
# always receives matrix-like input.
cor.data <- cor(df[, num.cols, drop = FALSE])
cor.data
##                     age   traveltime    studytime    failures      famrel
## age         1.000000000  0.070640721 -0.004140037  0.24366538  0.05394010
## traveltime  0.070640721  1.000000000 -0.100909119  0.09223875 -0.01680799
## studytime  -0.004140037 -0.100909119  1.000000000 -0.17356303  0.03973070
## failures    0.243665377  0.092238746 -0.173563031  1.00000000 -0.04433663
## famrel      0.053940096 -0.016807986  0.039730704 -0.04433663  1.00000000
## freetime    0.016434389 -0.017024944 -0.143198407  0.09198747  0.15070144
## goout       0.126963880  0.028539674 -0.063903675  0.12456092  0.06456841
## Dalc        0.131124605  0.138325309 -0.196019263  0.13604693 -0.07759436
## Walc        0.117276052  0.134115752 -0.253784731  0.14196203 -0.11339731
## health     -0.062187369  0.007500606 -0.075615863  0.06582728  0.09405573
## absences    0.175230079 -0.012943775 -0.062700175  0.06372583 -0.04435409
## G1         -0.064081497 -0.093039992  0.160611915 -0.35471761  0.02216832
## G2         -0.143474049 -0.153197963  0.135879999 -0.35589563 -0.01828135
## G3         -0.161579438 -0.117142053  0.097819690 -0.36041494  0.05136343
##               freetime        goout        Dalc        Walc       health
## age         0.01643439  0.126963880  0.13112460  0.11727605 -0.062187369
## traveltime -0.01702494  0.028539674  0.13832531  0.13411575  0.007500606
## studytime  -0.14319841 -0.063903675 -0.19601926 -0.25378473 -0.075615863
## failures    0.09198747  0.124560922  0.13604693  0.14196203  0.065827282
## famrel      0.15070144  0.064568411 -0.07759436 -0.11339731  0.094055728
## freetime    1.00000000  0.285018715  0.20900085  0.14782181  0.075733357
## goout       0.28501871  1.000000000  0.26699385  0.42038575 -0.009577254
## Dalc        0.20900085  0.266993848  1.00000000  0.64754423  0.077179582
## Walc        0.14782181  0.420385745  0.64754423  1.00000000  0.092476317
## health      0.07573336 -0.009577254  0.07717958  0.09247632  1.000000000
## absences   -0.05807792  0.044302220  0.11190803  0.13629110 -0.029936711
## G1          0.01261293 -0.149103967 -0.09415879 -0.12617921 -0.073172073
## G2         -0.01377714 -0.162250034 -0.06412018 -0.08492735 -0.097719866
## G3          0.01130724 -0.132791474 -0.05466004 -0.05193932 -0.061334605
##               absences          G1          G2          G3
## age         0.17523008 -0.06408150 -0.14347405 -0.16157944
## traveltime -0.01294378 -0.09303999 -0.15319796 -0.11714205
## studytime  -0.06270018  0.16061192  0.13588000  0.09781969
## failures    0.06372583 -0.35471761 -0.35589563 -0.36041494
## famrel     -0.04435409  0.02216832 -0.01828135  0.05136343
## freetime   -0.05807792  0.01261293 -0.01377714  0.01130724
## goout       0.04430222 -0.14910397 -0.16225003 -0.13279147
## Dalc        0.11190803 -0.09415879 -0.06412018 -0.05466004
## Walc        0.13629110 -0.12617921 -0.08492735 -0.05193932
## health     -0.02993671 -0.07317207 -0.09771987 -0.06133460
## absences    1.00000000 -0.03100290 -0.03177670  0.03424732
## G1         -0.03100290  1.00000000  0.85211807  0.80146793
## G2         -0.03177670  0.85211807  1.00000000  0.90486799
## G3          0.03424732  0.80146793  0.90486799  1.00000000

Lo graficamos para verlo mejor.

# cor.data is a correlation matrix, so the plot title says "Correlation"
# (the previous title incorrectly said "Regression").
corrplot(cor.data, method = "square", title = "Correlation between numeric columns")

La función corrgram crea una gráfica similar a la anterior, pero no es necesario pasarle únicamente las columnas numéricas: ella misma descarta las que no lo son.

# Correlogram of df: shaded cells below the diagonal, pie glyphs above it;
# order = TRUE asks corrgram to reorder the variables.
corrgram(
  df,
  order = TRUE,
  lower.panel = panel.shade,
  upper.panel = panel.pie
)

# Distribution of G3 (the grade the model will predict) as a 20-bin histogram.
ggplot(data = df, mapping = aes(x = G3)) +
  geom_histogram(bins = 20, fill = "blue", alpha = 0.4)

A continuación, vamos a tomar una parte del dataset para entrenar y otra para testear. Esto es frecuente en los procesos de Machine Learning.

# Fix the RNG seed so the split is reproducible.
set.seed(101)
# One logical flag per row; SplitRatio = 0.7 is the share requested for training.
sample <- sample.split(df$G3, SplitRatio = 0.7)
# Rows flagged TRUE form the training set, the remaining rows the test set.
train <- df[which(sample), ]
test <- df[which(!sample), ]

Construimos el modelo. La sintaxis por defecto es la siguiente.

# General form (illustrative only, not executed):
#   model <- lm(y ~ x1 + x2 + ... + xn, data)
#   model <- lm(y ~ ., data)  # `.` = use every other column to predict y

Como queremos predecir el valor de G3 creamos el modelo de la siguiente forma.

# Fit a linear model of G3 on every other column of the training set, then
# print the coefficient table and the fit statistics.
model <- lm(formula = G3 ~ ., data = train)
summary(model)
## 
## Call:
## lm(formula = G3 ~ ., data = train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -7.5680 -0.5674  0.3084  1.0395  5.2257 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       6.23327    3.67266   1.697  0.09102 .  
## schoolMS          0.69599    0.48779   1.427  0.15499    
## sexM              0.23721    0.30177   0.786  0.43264    
## age              -0.38947    0.13193  -2.952  0.00349 ** 
## addressU          0.03183    0.36384   0.087  0.93037    
## famsizeLE3        0.18457    0.29250   0.631  0.52867    
## PstatusT          0.10879    0.43788   0.248  0.80402    
## Medu1            -0.92880    1.27096  -0.731  0.46566    
## Medu2            -0.86969    1.27146  -0.684  0.49466    
## Medu3            -0.70779    1.30179  -0.544  0.58717    
## Medu4            -0.55480    1.35140  -0.411  0.68180    
## Fedu1            -1.03764    2.10244  -0.494  0.62210    
## Fedu2            -1.66656    2.12441  -0.784  0.43357    
## Fedu3            -1.39553    2.12795  -0.656  0.51260    
## Fedu4            -1.72018    2.16171  -0.796  0.42700    
## Mjobhealth       -0.58308    0.66478  -0.877  0.38136    
## Mjobother        -0.12112    0.43183  -0.280  0.77935    
## Mjobservices     -0.08431    0.47839  -0.176  0.86027    
## Mjobteacher      -0.12475    0.64084  -0.195  0.84583    
## Fjobhealth        0.22792    0.85260   0.267  0.78946    
## Fjobother        -0.31402    0.57000  -0.551  0.58223    
## Fjobservices     -0.71569    0.61012  -1.173  0.24200    
## Fjobteacher      -0.29948    0.76830  -0.390  0.69705    
## reasonhome       -0.40672    0.32392  -1.256  0.21054    
## reasonother       0.01506    0.45958   0.033  0.97389    
## reasonreputation  0.06901    0.35269   0.196  0.84504    
## guardianmother   -0.04171    0.31929  -0.131  0.89618    
## guardianother     0.03412    0.58980   0.058  0.95392    
## traveltime       -0.04467    0.19926  -0.224  0.82281    
## studytime        -0.03176    0.17164  -0.185  0.85338    
## failures         -0.21655    0.19962  -1.085  0.27915    
## schoolsupyes      0.16460    0.43019   0.383  0.70236    
## famsupyes        -0.04813    0.28019  -0.172  0.86378    
## paidyes           0.31374    0.28732   1.092  0.27599    
## activitiesyes    -0.27550    0.27107  -1.016  0.31053    
## nurseryyes       -0.04243    0.31680  -0.134  0.89357    
## higheryes        -0.87779    0.75085  -1.169  0.24360    
## internetyes      -0.17470    0.37379  -0.467  0.64068    
## romanticyes      -0.34177    0.29018  -1.178  0.24011    
## famrel            0.36317    0.14722   2.467  0.01437 *  
## freetime          0.07885    0.14381   0.548  0.58403    
## goout            -0.11117    0.13454  -0.826  0.40952    
## Dalc             -0.12917    0.21293  -0.607  0.54471    
## Walc              0.18950    0.15141   1.252  0.21200    
## health            0.07085    0.09463   0.749  0.45479    
## absences          0.09865    0.02405   4.101 5.71e-05 ***
## G1                0.15215    0.08008   1.900  0.05869 .  
## G2                0.97610    0.07119  13.711  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.973 on 229 degrees of freedom
## Multiple R-squared:  0.8478, Adjusted R-squared:  0.8166 
## F-statistic: 27.15 on 47 and 229 DF,  p-value: < 2.2e-16

Guardamos los residuos en una variable para graficarlos y ver si siguen una distribución normal. Observamos que el comando residuals devuelve los residuos como un vector numérico; por ello, posteriormente lo convertimos en un data frame.

# Extract the residuals of the fitted model; class() confirms a numeric vector.
res <- residuals(model)
class(res)
## [1] "numeric"
# Wrap the vector in a data frame so it can be passed to ggplot(). The single
# column is named after the variable, i.e. `res` (see the header printed by
# head() below), which is the name the later aes(res) mapping relies on.
res <- as.data.frame(res)
head(res)
##           res
## 1   1.5432735
## 2   1.7087115
## 3   1.1151504
## 6  -2.4530215
## 9   0.7903935
## 11  0.8357748
# Histogram of the residuals; no bin count is given, so ggplot2 falls back to
# its default and reports it in a message.
ggplot(data = res, mapping = aes(x = res)) +
  geom_histogram(alpha = 0.5, fill = "blue")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Podemos observar que el modelo predice que algunos estudiantes obtendrán una nota tan pobre que les da un valor negativo. Sin embargo, eso no es correcto pues en la realidad la nota más baja que puede obtener un alumno es un 0.

# Predict G3 for the held-out test rows. The argument must be `newdata`:
# predict() has no `data` parameter, so `data = test` is silently absorbed by
# `...` and the fitted values of the *training* rows are returned instead.
# NOTE(review): the values echoed after this call in the document were
# produced by the old `data = test` call (their row names 1, 2, 3, 6, 9, 11 ...
# are the training rows) and need to be regenerated, as do the results and
# metrics derived from them.
G3.predictions <- predict(model, newdata = test)
G3.predictions
##            1            2            3            6            9 
##  4.456726482  4.291288514  8.884849587 17.453021523 18.209606455 
##           11           12           14           15           16 
##  8.164225170 12.371583437 11.095988948 15.336651321 13.207405840 
##           17           18           19           20           22 
## 13.390420031  9.901018403  4.509466495  9.178136766 16.039007899 
##           23           24           26           27           28 
## 16.034059323 13.256356134  7.232358220 12.171899362 15.668960408 
##           29           31           33           34           35 
## 10.757332168 10.591917912 16.995210880  9.947549852 13.961861190 
##           36           37           38           41           43 
##  6.318656852 16.394519684 15.338963188  9.567120813 19.011759894 
##           44           45           46           47           48 
##  7.555182187 10.588592357  8.771281180 11.397442877 19.037032482 
##           50           51           52           53           55 
##  6.043237199 12.583809443 12.964350113 12.407153128 13.907480763 
##           56           58           59           60           63 
##  8.758891385 15.878150998  9.677403572 16.018316185  8.588320539 
##           66           67           68           69           70 
## 15.269956119 11.528459446  7.372665182  8.179447742 18.560248806 
##           72           73           74           76           78 
##  9.352147228  5.313063220 12.995412690  9.484828994 10.606280227 
##           80           81           82           83           85 
##  4.103160841 11.459776881 11.169543763  6.679714769  9.627326099 
##           87           88           89           92           93 
##  6.439021972 14.611134797 10.567447358 18.301286784  5.588390806 
##           94           95           96           97           98 
##  8.659191736 13.541170506  8.498540400 14.542496667  8.401000694 
##           99          100          102          103          104 
## 13.914471488  8.843943331 16.641989016 13.340363939  7.315207361 
##          106          107          108          109          110 
## 12.512776738  8.290636426 18.247658126 11.345512264 15.168955249 
##          111          112          113          115          117 
## 20.753846598  8.404735442 11.476421246  9.005006390 12.942613489 
##          118          120          122          123          124 
## 13.508020118 13.819520334 14.810758924 13.101083950 12.257056180 
##          125          126          127          128          130 
##  5.642481037 12.794851459  8.427592179  7.518797715 19.471871354 
##          131          132          133          134          135 
## -0.725496483 -0.567819598 12.553262321 12.296375519 -0.081401464 
##          137          138          139          140          141 
##  0.007674714 -2.252726699 12.531202584 16.232833521  7.568003636 
##          142          143          145          146          147 
##  7.159388676 10.236340511 -1.729635246 10.895199338  3.673349869 
##          148          150          151          152          153 
## 11.023052819  8.879038981  3.229979499 13.815255706  9.729164271 
##          157          158          159          160          163 
## 12.276739107  7.282085202 15.687936583 11.459605014 -0.799043779 
##          167          168          169          171          172 
## 10.676422043 13.305717992  5.255692709  2.825332896 16.000000000 
##          174          175          179          180          181 
##  3.682511643 10.767274680  7.678156222  9.213603848  8.087854026 
##          183          185          186          188          190 
## 16.681391410 12.938109427 11.752578184 15.477992692  7.834509103 
##          192          193          195          196          197 
##  6.568226638  7.727306424 13.691565729 13.380276719 15.578318897 
##          198          199          200          201          203 
##  8.910325753 19.458683059  8.958762442 15.662445164  8.971673676 
##          205          206          207          208          209 
##  9.308396802  9.861236931  5.685453261 11.101911573  9.833891652 
##          210          211          212          214          215 
##  6.107236059  7.092758199 12.735087311  6.190821599 10.961434407 
##          217          218          219          220          222 
##  6.089938516  5.902347953  6.018875767  9.154605842  3.010194349 
##          223          226          227          228          229 
## 15.802065714  7.960621320 15.787503466 11.414366569  8.819661208 
##          231          232          237          238          239 
## 12.763118593 10.290339335 12.677148699 13.541692138  9.542325266 
##          241          242          243          246          247 
## 11.888439907 10.338465081 -0.509856668 19.334508618 11.960549689 
##          248          250          251          253          254 
##  6.476233810 16.376271110  7.019350807  8.226195936  8.446870685 
##          255          256          257          258          259 
## 11.775222950  7.487981029 11.682266572 11.332196127 14.130072131 
##          264          265          266          267          268 
##  7.431221454  6.835508894 17.509684151  9.075945323  9.460252776 
##          269          270          271          272          273 
##  8.658350557 -1.234748014  7.317821179 12.926471489 10.248446449 
##          274          275          276          280          281 
## 13.174725278  9.061026200 11.920562167 10.420313291 10.348888287 
##          282          284          285          286          287 
##  9.254172607  7.581751863  8.411489871 10.483642895 18.380242980 
##          288          289          290          293          296 
## 12.372368642 14.010254671 13.650698029 12.757375756 12.101569838 
##          297          298          299          303          304 
##  5.923842198  7.386585534 12.493939410 11.733804040 17.393824706 
##          305          306          310          311          314 
## 13.732023580 11.702801936 10.893922898  5.223966850 10.608946365 
##          315          317          318          319          320 
## 12.830277503  6.823968152  9.027092585 10.073801763 10.137189333 
##          321          322          324          325          326 
## 14.864336606  8.624791275 13.292564141 15.104931175 10.405892271 
##          327          328          331          332          334 
## 14.788016320  9.649662198  6.702321113 13.223331219  5.009831421 
##          335          336          337          338          340 
##  7.001068028 16.310150621 14.073067754  6.008333595  8.749413095 
##          341          342          343          346          348 
##  9.816981004  7.235822395 15.000463821 12.085533986  9.381139380 
##          349          352          353          354          355 
## 14.122977568 13.451301775  7.166505174  6.855052538 11.281938765 
##          357          358          362          363          364 
## 12.928536173 10.892190848 13.484770264 10.021287759 14.487272751 
##          365          368          370          371          372 
## 10.004371046  5.192417712 12.092160423  3.774270467 12.436492767 
##          373          374          375          376          377 
## 10.514193129  4.510637093 18.528146558  7.032121631 12.862650696 
##          378          380          382          384          385 
##  8.596814539 11.957187566  5.747896292  3.841449055  5.494126323 
##          387          388          389          391          393 
##  4.436302804  3.706941874  7.904610140  8.814542445  7.021464401 
##          394          395 
## 12.185212665  8.777134327
# Put predictions and observed test-set grades side by side.
# NOTE(review): cbind() recycles the shorter argument. The warning recorded
# below shows that G3.predictions and test$G3 had different lengths when this
# ran, so the rows were not paired one-to-one; G3.predictions must hold exactly
# one value per row of `test` for this table to be meaningful.
results <- cbind(G3.predictions, test$G3)
## Warning in cbind(G3.predictions, test$G3): number of rows of result is not
## a multiple of vector length (arg 2)
# Name the two columns and turn the matrix into a data frame.
colnames(results) <- c('predicted','actual')
results <- as.data.frame(results)
head(results)
##    predicted actual
## 1   4.456726     15
## 2   4.291289     10
## 3   8.884850     11
## 6  17.453022      6
## 9  18.209606     15
## 11  8.164225     14

Modificamos las notas que son menores que 0 y las dejamos en 0.

# Clamp a single value at zero: a negative input yields 0, any other input is
# returned unchanged. `x` must be a length-one value because the test is a
# plain `if`.
to_zero <- function(x) {
  if (x < 0) {
    return(0)
  }
  x
}
# Clamp every prediction at zero. vapply() with numeric(1) always returns a
# numeric vector, whereas sapply() would return a list for a zero-row input.
results$predicted <- vapply(results$predicted, to_zero, numeric(1))
summary(results)
##    predicted          actual     
##  Min.   : 0.000   Min.   : 0.00  
##  1st Qu.: 7.582   1st Qu.: 8.00  
##  Median :10.484   Median :11.00  
##  Mean   :10.444   Mean   :10.45  
##  3rd Qu.:13.223   3rd Qu.:14.00  
##  Max.   :20.754   Max.   :19.00

Podemos evaluar los resultados que hemos predicho de distintas maneras. Una de ellas es con el cálculo de la Mean Squared Error (MSE).

# Mean squared error: average squared gap between observed and predicted values.
mse <- with(results, mean((actual - predicted)^2))
mse
## [1] 33.97113

O su raíz cuadrada.

# Root mean squared error, expressed in the same units as the grades.
rmse <- sqrt(mse)
rmse
## [1] 5.828476

Otro método es el cálculo del R-Squared Value

# R-squared = 1 - SSE / SST.
# NOTE(review): the value echoed after this chunk in the document was produced
# by the earlier formula and needs to be regenerated.
#
# SSE: sum of squared prediction errors. Each difference is squared *before*
# summing; squaring the summed differences lets positive and negative errors
# cancel out and yields a misleadingly small SSE.
SSE <- sum((results$predicted - results$actual)^2)
# SST: total sum of squares of the observed values around their own mean, i.e.
# the mean of the rows being scored rather than of the whole dataset.
SST <- sum((results$actual - mean(results$actual))^2)
R2 <- 1 - SSE / SST
R2
## [1] 0.9992326