Regresión lineal con R
library(dplyr)
library(ggplot2)
library(ggthemes)
library(corrgram)
library(corrplot)
library(caTools)
# Point the session at the folder that holds the course data files.
# NOTE(review): this path is machine-specific; adjust it before running on
# another machine.
setwd("~/R/R-Course-HTML-Notes/R-for-Data-Science-and-Machine-Learning/Machine Learning with R")
# The file is semicolon-separated. stringsAsFactors = TRUE keeps text columns
# as factors on R >= 4.0 (where the default became FALSE), which is what the
# str()/summary() output shown in this document reflects.
df <- read.csv("student-mat.csv", sep = ";", stringsAsFactors = TRUE)
# Peek at the first rows to confirm the file was parsed as expected.
head(df)
## school sex age address famsize Pstatus Medu Fedu Mjob Fjob
## 1 GP F 18 U GT3 A 4 4 at_home teacher
## 2 GP F 17 U GT3 T 1 1 at_home other
## 3 GP F 15 U LE3 T 1 1 at_home other
## 4 GP F 15 U GT3 T 4 2 health services
## 5 GP F 16 U GT3 T 3 3 other other
## 6 GP M 16 U LE3 T 4 3 services other
## reason guardian traveltime studytime failures schoolsup famsup paid
## 1 course mother 2 2 0 yes no no
## 2 course father 1 2 0 no yes no
## 3 other mother 1 2 3 yes no yes
## 4 home mother 1 3 0 no yes yes
## 5 home father 1 2 0 no yes yes
## 6 reputation mother 1 2 0 no yes yes
## activities nursery higher internet romantic famrel freetime goout Dalc
## 1 no yes yes no no 4 3 4 1
## 2 no no yes yes no 5 3 3 1
## 3 no yes yes yes no 4 3 2 2
## 4 yes yes yes yes yes 3 2 2 1
## 5 no yes yes no no 4 3 2 1
## 6 yes yes yes yes no 5 4 2 1
## Walc health absences G1 G2 G3
## 1 1 3 6 5 6 6
## 2 1 3 4 5 5 6
## 3 3 3 10 7 8 10
## 4 1 5 2 15 14 15
## 5 2 5 4 6 10 10
## 6 2 5 10 15 15 15
# Per-column summary: level counts for factors, quantiles and mean for numbers.
summary(df)
## school sex age address famsize Pstatus Medu
## GP:349 F:208 Min. :15.0 R: 88 GT3:281 A: 41 Min. :0.000
## MS: 46 M:187 1st Qu.:16.0 U:307 LE3:114 T:354 1st Qu.:2.000
## Median :17.0 Median :3.000
## Mean :16.7 Mean :2.749
## 3rd Qu.:18.0 3rd Qu.:4.000
## Max. :22.0 Max. :4.000
## Fedu Mjob Fjob reason
## Min. :0.000 at_home : 59 at_home : 20 course :145
## 1st Qu.:2.000 health : 34 health : 18 home :109
## Median :2.000 other :141 other :217 other : 36
## Mean :2.522 services:103 services:111 reputation:105
## 3rd Qu.:3.000 teacher : 58 teacher : 29
## Max. :4.000
## guardian traveltime studytime failures schoolsup
## father: 90 Min. :1.000 Min. :1.000 Min. :0.0000 no :344
## mother:273 1st Qu.:1.000 1st Qu.:1.000 1st Qu.:0.0000 yes: 51
## other : 32 Median :1.000 Median :2.000 Median :0.0000
## Mean :1.448 Mean :2.035 Mean :0.3342
## 3rd Qu.:2.000 3rd Qu.:2.000 3rd Qu.:0.0000
## Max. :4.000 Max. :4.000 Max. :3.0000
## famsup paid activities nursery higher internet romantic
## no :153 no :214 no :194 no : 81 no : 20 no : 66 no :263
## yes:242 yes:181 yes:201 yes:314 yes:375 yes:329 yes:132
##
##
##
##
## famrel freetime goout Dalc
## Min. :1.000 Min. :1.000 Min. :1.000 Min. :1.000
## 1st Qu.:4.000 1st Qu.:3.000 1st Qu.:2.000 1st Qu.:1.000
## Median :4.000 Median :3.000 Median :3.000 Median :1.000
## Mean :3.944 Mean :3.235 Mean :3.109 Mean :1.481
## 3rd Qu.:5.000 3rd Qu.:4.000 3rd Qu.:4.000 3rd Qu.:2.000
## Max. :5.000 Max. :5.000 Max. :5.000 Max. :5.000
## Walc health absences G1
## Min. :1.000 Min. :1.000 Min. : 0.000 Min. : 3.00
## 1st Qu.:1.000 1st Qu.:3.000 1st Qu.: 0.000 1st Qu.: 8.00
## Median :2.000 Median :4.000 Median : 4.000 Median :11.00
## Mean :2.291 Mean :3.554 Mean : 5.709 Mean :10.91
## 3rd Qu.:3.000 3rd Qu.:5.000 3rd Qu.: 8.000 3rd Qu.:13.00
## Max. :5.000 Max. :5.000 Max. :75.000 Max. :19.00
## G2 G3
## Min. : 0.00 Min. : 0.00
## 1st Qu.: 9.00 1st Qu.: 8.00
## Median :11.00 Median :11.00
## Mean :10.71 Mean :10.42
## 3rd Qu.:13.00 3rd Qu.:14.00
## Max. :19.00 Max. :20.00
# Show the type of every column together with its first few values.
str(df)
## 'data.frame': 395 obs. of 33 variables:
## $ school : Factor w/ 2 levels "GP","MS": 1 1 1 1 1 1 1 1 1 1 ...
## $ sex : Factor w/ 2 levels "F","M": 1 1 1 1 1 2 2 1 2 2 ...
## $ age : int 18 17 15 15 16 16 16 17 15 15 ...
## $ address : Factor w/ 2 levels "R","U": 2 2 2 2 2 2 2 2 2 2 ...
## $ famsize : Factor w/ 2 levels "GT3","LE3": 1 1 2 1 1 2 2 1 2 1 ...
## $ Pstatus : Factor w/ 2 levels "A","T": 1 2 2 2 2 2 2 1 1 2 ...
## $ Medu : int 4 1 1 4 3 4 2 4 3 3 ...
## $ Fedu : int 4 1 1 2 3 3 2 4 2 4 ...
## $ Mjob : Factor w/ 5 levels "at_home","health",..: 1 1 1 2 3 4 3 3 4 3 ...
## $ Fjob : Factor w/ 5 levels "at_home","health",..: 5 3 3 4 3 3 3 5 3 3 ...
## $ reason : Factor w/ 4 levels "course","home",..: 1 1 3 2 2 4 2 2 2 2 ...
## $ guardian : Factor w/ 3 levels "father","mother",..: 2 1 2 2 1 2 2 2 2 2 ...
## $ traveltime: int 2 1 1 1 1 1 1 2 1 1 ...
## $ studytime : int 2 2 2 3 2 2 2 2 2 2 ...
## $ failures : int 0 0 3 0 0 0 0 0 0 0 ...
## $ schoolsup : Factor w/ 2 levels "no","yes": 2 1 2 1 1 1 1 2 1 1 ...
## $ famsup : Factor w/ 2 levels "no","yes": 1 2 1 2 2 2 1 2 2 2 ...
## $ paid : Factor w/ 2 levels "no","yes": 1 1 2 2 2 2 1 1 2 2 ...
## $ activities: Factor w/ 2 levels "no","yes": 1 1 1 2 1 2 1 1 1 2 ...
## $ nursery : Factor w/ 2 levels "no","yes": 2 1 2 2 2 2 2 2 2 2 ...
## $ higher : Factor w/ 2 levels "no","yes": 2 2 2 2 2 2 2 2 2 2 ...
## $ internet : Factor w/ 2 levels "no","yes": 1 2 2 2 1 2 2 1 2 2 ...
## $ romantic : Factor w/ 2 levels "no","yes": 1 1 1 2 1 1 1 1 1 1 ...
## $ famrel : int 4 5 4 3 4 5 4 4 4 5 ...
## $ freetime : int 3 3 3 2 3 4 4 1 2 5 ...
## $ goout : int 4 3 2 2 2 2 4 4 2 1 ...
## $ Dalc : int 1 1 2 1 1 1 1 1 1 1 ...
## $ Walc : int 1 1 3 1 2 2 1 1 1 1 ...
## $ health : int 3 3 3 5 5 5 3 1 1 5 ...
## $ absences : int 6 4 10 2 4 10 0 6 0 0 ...
## $ G1 : int 5 5 7 15 6 15 12 6 16 14 ...
## $ G2 : int 6 5 8 14 10 15 12 5 18 15 ...
## $ G3 : int 6 6 10 15 10 15 11 6 19 15 ...
# Medu and Fedu are integer-coded (0-4 in the summary above); convert both to
# factors so they are handled as categories instead of numbers.
edu_cols <- c("Medu", "Fedu")
df[edu_cols] <- lapply(df[edu_cols], as.factor)
# Confirm the two columns are now factors.
str(df)
## 'data.frame': 395 obs. of 33 variables:
## $ school : Factor w/ 2 levels "GP","MS": 1 1 1 1 1 1 1 1 1 1 ...
## $ sex : Factor w/ 2 levels "F","M": 1 1 1 1 1 2 2 1 2 2 ...
## $ age : int 18 17 15 15 16 16 16 17 15 15 ...
## $ address : Factor w/ 2 levels "R","U": 2 2 2 2 2 2 2 2 2 2 ...
## $ famsize : Factor w/ 2 levels "GT3","LE3": 1 1 2 1 1 2 2 1 2 1 ...
## $ Pstatus : Factor w/ 2 levels "A","T": 1 2 2 2 2 2 2 1 1 2 ...
## $ Medu : Factor w/ 5 levels "0","1","2","3",..: 5 2 2 5 4 5 3 5 4 4 ...
## $ Fedu : Factor w/ 5 levels "0","1","2","3",..: 5 2 2 3 4 4 3 5 3 5 ...
## $ Mjob : Factor w/ 5 levels "at_home","health",..: 1 1 1 2 3 4 3 3 4 3 ...
## $ Fjob : Factor w/ 5 levels "at_home","health",..: 5 3 3 4 3 3 3 5 3 3 ...
## $ reason : Factor w/ 4 levels "course","home",..: 1 1 3 2 2 4 2 2 2 2 ...
## $ guardian : Factor w/ 3 levels "father","mother",..: 2 1 2 2 1 2 2 2 2 2 ...
## $ traveltime: int 2 1 1 1 1 1 1 2 1 1 ...
## $ studytime : int 2 2 2 3 2 2 2 2 2 2 ...
## $ failures : int 0 0 3 0 0 0 0 0 0 0 ...
## $ schoolsup : Factor w/ 2 levels "no","yes": 2 1 2 1 1 1 1 2 1 1 ...
## $ famsup : Factor w/ 2 levels "no","yes": 1 2 1 2 2 2 1 2 2 2 ...
## $ paid : Factor w/ 2 levels "no","yes": 1 1 2 2 2 2 1 1 2 2 ...
## $ activities: Factor w/ 2 levels "no","yes": 1 1 1 2 1 2 1 1 1 2 ...
## $ nursery : Factor w/ 2 levels "no","yes": 2 1 2 2 2 2 2 2 2 2 ...
## $ higher : Factor w/ 2 levels "no","yes": 2 2 2 2 2 2 2 2 2 2 ...
## $ internet : Factor w/ 2 levels "no","yes": 1 2 2 2 1 2 2 1 2 2 ...
## $ romantic : Factor w/ 2 levels "no","yes": 1 1 1 2 1 1 1 1 1 1 ...
## $ famrel : int 4 5 4 3 4 5 4 4 4 5 ...
## $ freetime : int 3 3 3 2 3 4 4 1 2 5 ...
## $ goout : int 4 3 2 2 2 2 4 4 2 1 ...
## $ Dalc : int 1 1 2 1 1 1 1 1 1 1 ...
## $ Walc : int 1 1 3 1 2 2 1 1 1 1 ...
## $ health : int 3 3 3 5 5 5 3 1 1 5 ...
## $ absences : int 6 4 10 2 4 10 0 6 0 0 ...
## $ G1 : int 5 5 7 15 6 15 12 6 16 14 ...
## $ G2 : int 6 5 8 14 10 15 12 5 18 15 ...
## $ G3 : int 6 6 10 15 10 15 11 6 19 15 ...
# TRUE if at least one cell of the data frame is missing.
any(is.na(df))
## [1] FALSE
Vamos a calcular la correlación entre las columnas numéricas del dataframe.
# Logical mask (one entry per column) marking the numeric columns.
num.cols <- vapply(df, is.numeric, logical(1))
# Pairwise correlation matrix of the numeric columns only.
cor.data <- cor(df[, num.cols])
cor.data
## age traveltime studytime failures famrel
## age 1.000000000 0.070640721 -0.004140037 0.24366538 0.05394010
## traveltime 0.070640721 1.000000000 -0.100909119 0.09223875 -0.01680799
## studytime -0.004140037 -0.100909119 1.000000000 -0.17356303 0.03973070
## failures 0.243665377 0.092238746 -0.173563031 1.00000000 -0.04433663
## famrel 0.053940096 -0.016807986 0.039730704 -0.04433663 1.00000000
## freetime 0.016434389 -0.017024944 -0.143198407 0.09198747 0.15070144
## goout 0.126963880 0.028539674 -0.063903675 0.12456092 0.06456841
## Dalc 0.131124605 0.138325309 -0.196019263 0.13604693 -0.07759436
## Walc 0.117276052 0.134115752 -0.253784731 0.14196203 -0.11339731
## health -0.062187369 0.007500606 -0.075615863 0.06582728 0.09405573
## absences 0.175230079 -0.012943775 -0.062700175 0.06372583 -0.04435409
## G1 -0.064081497 -0.093039992 0.160611915 -0.35471761 0.02216832
## G2 -0.143474049 -0.153197963 0.135879999 -0.35589563 -0.01828135
## G3 -0.161579438 -0.117142053 0.097819690 -0.36041494 0.05136343
## freetime goout Dalc Walc health
## age 0.01643439 0.126963880 0.13112460 0.11727605 -0.062187369
## traveltime -0.01702494 0.028539674 0.13832531 0.13411575 0.007500606
## studytime -0.14319841 -0.063903675 -0.19601926 -0.25378473 -0.075615863
## failures 0.09198747 0.124560922 0.13604693 0.14196203 0.065827282
## famrel 0.15070144 0.064568411 -0.07759436 -0.11339731 0.094055728
## freetime 1.00000000 0.285018715 0.20900085 0.14782181 0.075733357
## goout 0.28501871 1.000000000 0.26699385 0.42038575 -0.009577254
## Dalc 0.20900085 0.266993848 1.00000000 0.64754423 0.077179582
## Walc 0.14782181 0.420385745 0.64754423 1.00000000 0.092476317
## health 0.07573336 -0.009577254 0.07717958 0.09247632 1.000000000
## absences -0.05807792 0.044302220 0.11190803 0.13629110 -0.029936711
## G1 0.01261293 -0.149103967 -0.09415879 -0.12617921 -0.073172073
## G2 -0.01377714 -0.162250034 -0.06412018 -0.08492735 -0.097719866
## G3 0.01130724 -0.132791474 -0.05466004 -0.05193932 -0.061334605
## absences G1 G2 G3
## age 0.17523008 -0.06408150 -0.14347405 -0.16157944
## traveltime -0.01294378 -0.09303999 -0.15319796 -0.11714205
## studytime -0.06270018 0.16061192 0.13588000 0.09781969
## failures 0.06372583 -0.35471761 -0.35589563 -0.36041494
## famrel -0.04435409 0.02216832 -0.01828135 0.05136343
## freetime -0.05807792 0.01261293 -0.01377714 0.01130724
## goout 0.04430222 -0.14910397 -0.16225003 -0.13279147
## Dalc 0.11190803 -0.09415879 -0.06412018 -0.05466004
## Walc 0.13629110 -0.12617921 -0.08492735 -0.05193932
## health -0.02993671 -0.07317207 -0.09771987 -0.06133460
## absences 1.00000000 -0.03100290 -0.03177670 0.03424732
## G1 -0.03100290 1.00000000 0.85211807 0.80146793
## G2 -0.03177670 0.85211807 1.00000000 0.90486799
## G3 0.03424732 0.80146793 0.90486799 1.00000000
Lo graficamos para verlo mejor.
# Draw the correlation matrix computed with cor() as a grid of coloured squares.
# The title now says "Correlation": the matrix holds correlations, not a
# regression.
corrplot(cor.data, method = "square", title = "Correlation between numeric columns")
La función corrgram crea una gráfica similar a la anterior, pero no es necesario pasarle solo las columnas numéricas: ella misma descarta las que no lo son.
# Correlogram of the whole data frame: shaded cells in the lower panel, pie
# glyphs in the upper panel, with variable reordering enabled (order = TRUE).
corrgram(df, order = TRUE, lower.panel = panel.shade, upper.panel = panel.pie)
# Histogram of G3 using 20 bins.
ggplot(data = df, mapping = aes(x = G3)) +
  geom_histogram(fill = "blue", alpha = 0.4, bins = 20)
A continuación, vamos a tomar una parte del dataset para entrenar y otra para testear. Esto es frecuente en los procesos de Machine Learning.
# Fix the random seed so the split below is the same on every run.
set.seed(101)
# sample.split() (caTools) yields a logical mask over the rows, used below to
# index df; SplitRatio = 0.7 asks for about 70% of the rows to be TRUE.
# Note: the variable name `sample` masks base R's sample() function.
sample <- sample.split(df$G3, SplitRatio = 0.7)
# Rows flagged TRUE form the training set ...
train <- df[sample, ]
# ... and the remaining rows form the test set.
test <- df[!sample, ]
Construimos el modelo. La sintaxis por defecto es la siguiente.
#model <- lm(y~x1+x2+...+xn, data)
#model <- lm(y~., data) #Si queremos usar todas las columnas para predecir y
Como queremos predecir el valor de G3 creamos el modelo de la siguiente forma.
# Fit a linear model of G3 on every other column, using the training rows only.
model <- lm(formula = G3 ~ ., data = train)
# Coefficient table, residual summary and goodness-of-fit statistics.
summary(model)
##
## Call:
## lm(formula = G3 ~ ., data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7.5680 -0.5674 0.3084 1.0395 5.2257
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.23327 3.67266 1.697 0.09102 .
## schoolMS 0.69599 0.48779 1.427 0.15499
## sexM 0.23721 0.30177 0.786 0.43264
## age -0.38947 0.13193 -2.952 0.00349 **
## addressU 0.03183 0.36384 0.087 0.93037
## famsizeLE3 0.18457 0.29250 0.631 0.52867
## PstatusT 0.10879 0.43788 0.248 0.80402
## Medu1 -0.92880 1.27096 -0.731 0.46566
## Medu2 -0.86969 1.27146 -0.684 0.49466
## Medu3 -0.70779 1.30179 -0.544 0.58717
## Medu4 -0.55480 1.35140 -0.411 0.68180
## Fedu1 -1.03764 2.10244 -0.494 0.62210
## Fedu2 -1.66656 2.12441 -0.784 0.43357
## Fedu3 -1.39553 2.12795 -0.656 0.51260
## Fedu4 -1.72018 2.16171 -0.796 0.42700
## Mjobhealth -0.58308 0.66478 -0.877 0.38136
## Mjobother -0.12112 0.43183 -0.280 0.77935
## Mjobservices -0.08431 0.47839 -0.176 0.86027
## Mjobteacher -0.12475 0.64084 -0.195 0.84583
## Fjobhealth 0.22792 0.85260 0.267 0.78946
## Fjobother -0.31402 0.57000 -0.551 0.58223
## Fjobservices -0.71569 0.61012 -1.173 0.24200
## Fjobteacher -0.29948 0.76830 -0.390 0.69705
## reasonhome -0.40672 0.32392 -1.256 0.21054
## reasonother 0.01506 0.45958 0.033 0.97389
## reasonreputation 0.06901 0.35269 0.196 0.84504
## guardianmother -0.04171 0.31929 -0.131 0.89618
## guardianother 0.03412 0.58980 0.058 0.95392
## traveltime -0.04467 0.19926 -0.224 0.82281
## studytime -0.03176 0.17164 -0.185 0.85338
## failures -0.21655 0.19962 -1.085 0.27915
## schoolsupyes 0.16460 0.43019 0.383 0.70236
## famsupyes -0.04813 0.28019 -0.172 0.86378
## paidyes 0.31374 0.28732 1.092 0.27599
## activitiesyes -0.27550 0.27107 -1.016 0.31053
## nurseryyes -0.04243 0.31680 -0.134 0.89357
## higheryes -0.87779 0.75085 -1.169 0.24360
## internetyes -0.17470 0.37379 -0.467 0.64068
## romanticyes -0.34177 0.29018 -1.178 0.24011
## famrel 0.36317 0.14722 2.467 0.01437 *
## freetime 0.07885 0.14381 0.548 0.58403
## goout -0.11117 0.13454 -0.826 0.40952
## Dalc -0.12917 0.21293 -0.607 0.54471
## Walc 0.18950 0.15141 1.252 0.21200
## health 0.07085 0.09463 0.749 0.45479
## absences 0.09865 0.02405 4.101 5.71e-05 ***
## G1 0.15215 0.08008 1.900 0.05869 .
## G2 0.97610 0.07119 13.711 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.973 on 229 degrees of freedom
## Multiple R-squared: 0.8478, Adjusted R-squared: 0.8166
## F-statistic: 27.15 on 47 and 229 DF, p-value: < 2.2e-16
Guardamos los residuos en una variable para graficarlos y ver si siguen una distribución normal. Observamos que con el comando residuals obtenemos los valores de todos los residuos en forma de vector numérico. Por ello, posteriormente los convertimos en un data frame.
# Extract the residuals of the fitted model.
res <- residuals(model)
# Check the type returned by residuals().
class(res)
## [1] "numeric"
# ggplot() needs a data frame; the conversion yields a single column that is
# also named `res` (see the head() output below).
res <- as.data.frame(res)
head(res)
## res
## 1 1.5432735
## 2 1.7087115
## 3 1.1151504
## 6 -2.4530215
## 9 0.7903935
## 11 0.8357748
# Histogram of the residuals (column `res` of data frame `res`), default bins.
ggplot(data = res, mapping = aes(x = res)) +
  geom_histogram(alpha = 0.5, fill = "blue")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Podemos observar que el modelo predice que algunos estudiantes obtendrán una nota tan pobre que les da un valor negativo. Sin embargo, eso no es correcto pues en la realidad la nota más baja que puede obtener un alumno es un 0.
# Predict G3 for the held-out test rows. predict() on an lm model receives the
# new observations through `newdata`; the previous `data = test` argument was
# silently ignored, so the fitted values of the training rows came back instead.
# NOTE: the values printed after this call in the document came from the old
# call and therefore show training-row predictions.
G3.predictions <- predict(model, newdata = test)
G3.predictions
## 1 2 3 6 9
## 4.456726482 4.291288514 8.884849587 17.453021523 18.209606455
## 11 12 14 15 16
## 8.164225170 12.371583437 11.095988948 15.336651321 13.207405840
## 17 18 19 20 22
## 13.390420031 9.901018403 4.509466495 9.178136766 16.039007899
## 23 24 26 27 28
## 16.034059323 13.256356134 7.232358220 12.171899362 15.668960408
## 29 31 33 34 35
## 10.757332168 10.591917912 16.995210880 9.947549852 13.961861190
## 36 37 38 41 43
## 6.318656852 16.394519684 15.338963188 9.567120813 19.011759894
## 44 45 46 47 48
## 7.555182187 10.588592357 8.771281180 11.397442877 19.037032482
## 50 51 52 53 55
## 6.043237199 12.583809443 12.964350113 12.407153128 13.907480763
## 56 58 59 60 63
## 8.758891385 15.878150998 9.677403572 16.018316185 8.588320539
## 66 67 68 69 70
## 15.269956119 11.528459446 7.372665182 8.179447742 18.560248806
## 72 73 74 76 78
## 9.352147228 5.313063220 12.995412690 9.484828994 10.606280227
## 80 81 82 83 85
## 4.103160841 11.459776881 11.169543763 6.679714769 9.627326099
## 87 88 89 92 93
## 6.439021972 14.611134797 10.567447358 18.301286784 5.588390806
## 94 95 96 97 98
## 8.659191736 13.541170506 8.498540400 14.542496667 8.401000694
## 99 100 102 103 104
## 13.914471488 8.843943331 16.641989016 13.340363939 7.315207361
## 106 107 108 109 110
## 12.512776738 8.290636426 18.247658126 11.345512264 15.168955249
## 111 112 113 115 117
## 20.753846598 8.404735442 11.476421246 9.005006390 12.942613489
## 118 120 122 123 124
## 13.508020118 13.819520334 14.810758924 13.101083950 12.257056180
## 125 126 127 128 130
## 5.642481037 12.794851459 8.427592179 7.518797715 19.471871354
## 131 132 133 134 135
## -0.725496483 -0.567819598 12.553262321 12.296375519 -0.081401464
## 137 138 139 140 141
## 0.007674714 -2.252726699 12.531202584 16.232833521 7.568003636
## 142 143 145 146 147
## 7.159388676 10.236340511 -1.729635246 10.895199338 3.673349869
## 148 150 151 152 153
## 11.023052819 8.879038981 3.229979499 13.815255706 9.729164271
## 157 158 159 160 163
## 12.276739107 7.282085202 15.687936583 11.459605014 -0.799043779
## 167 168 169 171 172
## 10.676422043 13.305717992 5.255692709 2.825332896 16.000000000
## 174 175 179 180 181
## 3.682511643 10.767274680 7.678156222 9.213603848 8.087854026
## 183 185 186 188 190
## 16.681391410 12.938109427 11.752578184 15.477992692 7.834509103
## 192 193 195 196 197
## 6.568226638 7.727306424 13.691565729 13.380276719 15.578318897
## 198 199 200 201 203
## 8.910325753 19.458683059 8.958762442 15.662445164 8.971673676
## 205 206 207 208 209
## 9.308396802 9.861236931 5.685453261 11.101911573 9.833891652
## 210 211 212 214 215
## 6.107236059 7.092758199 12.735087311 6.190821599 10.961434407
## 217 218 219 220 222
## 6.089938516 5.902347953 6.018875767 9.154605842 3.010194349
## 223 226 227 228 229
## 15.802065714 7.960621320 15.787503466 11.414366569 8.819661208
## 231 232 237 238 239
## 12.763118593 10.290339335 12.677148699 13.541692138 9.542325266
## 241 242 243 246 247
## 11.888439907 10.338465081 -0.509856668 19.334508618 11.960549689
## 248 250 251 253 254
## 6.476233810 16.376271110 7.019350807 8.226195936 8.446870685
## 255 256 257 258 259
## 11.775222950 7.487981029 11.682266572 11.332196127 14.130072131
## 264 265 266 267 268
## 7.431221454 6.835508894 17.509684151 9.075945323 9.460252776
## 269 270 271 272 273
## 8.658350557 -1.234748014 7.317821179 12.926471489 10.248446449
## 274 275 276 280 281
## 13.174725278 9.061026200 11.920562167 10.420313291 10.348888287
## 282 284 285 286 287
## 9.254172607 7.581751863 8.411489871 10.483642895 18.380242980
## 288 289 290 293 296
## 12.372368642 14.010254671 13.650698029 12.757375756 12.101569838
## 297 298 299 303 304
## 5.923842198 7.386585534 12.493939410 11.733804040 17.393824706
## 305 306 310 311 314
## 13.732023580 11.702801936 10.893922898 5.223966850 10.608946365
## 315 317 318 319 320
## 12.830277503 6.823968152 9.027092585 10.073801763 10.137189333
## 321 322 324 325 326
## 14.864336606 8.624791275 13.292564141 15.104931175 10.405892271
## 327 328 331 332 334
## 14.788016320 9.649662198 6.702321113 13.223331219 5.009831421
## 335 336 337 338 340
## 7.001068028 16.310150621 14.073067754 6.008333595 8.749413095
## 341 342 343 346 348
## 9.816981004 7.235822395 15.000463821 12.085533986 9.381139380
## 349 352 353 354 355
## 14.122977568 13.451301775 7.166505174 6.855052538 11.281938765
## 357 358 362 363 364
## 12.928536173 10.892190848 13.484770264 10.021287759 14.487272751
## 365 368 370 371 372
## 10.004371046 5.192417712 12.092160423 3.774270467 12.436492767
## 373 374 375 376 377
## 10.514193129 4.510637093 18.528146558 7.032121631 12.862650696
## 378 380 382 384 385
## 8.596814539 11.957187566 5.747896292 3.841449055 5.494126323
## 387 388 389 391 393
## 4.436302804 3.706941874 7.904610140 8.814542445 7.021464401
## 394 395
## 12.185212665 8.777134327
# Put the predictions and the observed test grades side by side as two columns.
# cbind() recycles the shorter input when the lengths differ, which is what the
# warning below reports; both inputs must have the same length for the rows to
# line up.
results <- cbind(G3.predictions, test$G3)
## Warning in cbind(G3.predictions, test$G3): number of rows of result is not
## a multiple of vector length (arg 2)
# Name the two columns and turn the matrix into a data frame.
colnames(results) <- c('predicted','actual')
results <- as.data.frame(results)
head(results)
## predicted actual
## 1 4.456726 15
## 2 4.291289 10
## 3 8.884850 11
## 6 17.453022 6
## 9 18.209606 15
## 11 8.164225 14
Modificamos las notas que son menores que 0 y las dejamos en 0.
# Clamp negative values to zero.
#
# Args:
#   x: A numeric value or vector.
#
# Returns:
#   `x` with every element below 0 replaced by 0; all other elements are
#   returned unchanged.
#
# pmax() replaces the scalar `if`, so the function gives the same answer as
# before for a single number and now also accepts whole vectors.
to_zero <- function(x) {
  pmax(x, 0)
}
# Replace negative predictions with 0. vapply() always returns a plain numeric
# vector, whereas sapply() would return a list when given empty input.
results$predicted <- vapply(results$predicted, to_zero, numeric(1))
# The minimum of `predicted` should now be at least 0.
summary(results)
## predicted actual
## Min. : 0.000 Min. : 0.00
## 1st Qu.: 7.582 1st Qu.: 8.00
## Median :10.484 Median :11.00
## Mean :10.444 Mean :10.45
## 3rd Qu.:13.223 3rd Qu.:14.00
## Max. :20.754 Max. :19.00
Podemos evaluar los resultados que hemos predicho de distintas maneras. Una de ellas es con el cálculo de la Mean Squared Error (MSE).
# Mean squared error: the average of the squared prediction errors.
squared_error <- (results$actual - results$predicted)^2
mse <- mean(squared_error)
mse
## [1] 33.97113
O su raíz cuadrada.
# Root mean squared error: the square root of the MSE computed above.
rmse <- mse^0.5
rmse
## [1] 5.828476
Otro método es el cálculo del coeficiente de determinación (R-squared):
# R-squared = 1 - SSE / SST.
# SSE: sum of the squared prediction errors. Each error has to be squared
# before summing; squaring the summed errors lets positive and negative errors
# cancel out and makes R2 look far better than it is.
# NOTE: the value printed after this block in the document came from the old
# formula.
SSE <- sum((results$predicted - results$actual)^2)
# SST: total squared deviation of the actual grades from the mean grade.
# NOTE(review): the mean is taken over the whole data set (df$G3), not only
# over the rows in `results` - confirm this is the intended baseline.
SST <- sum((mean(df$G3) - results$actual)^2)
R2 <- 1 - SSE / SST
R2
## [1] 0.9992326