library(data.table)
library(rpart.plot)
## Loading required package: rpart
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
library(jtools)
library(ggplot2)
library(moderndive)
## Registered S3 methods overwritten by 'broom':
## method from
## tidy.glht jtools
## tidy.summary.glht jtools
library(knitr)
library(factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(dbscan)
# Read the data set. fread() already returns a data.table (see the class()
# output below), so no extra coercion step is required.
datos <- fread("DATOS CONTROL 3.csv")
class(datos)
## [1] "data.table" "data.frame"
str(datos)
## Classes 'data.table' and 'data.frame': 303 obs. of 14 variables:
## $ age : int 63 37 41 56 57 57 56 44 52 57 ...
## $ sex : int 1 1 0 1 0 1 0 1 1 1 ...
## $ cp : int 3 2 1 1 0 0 1 1 2 2 ...
## $ trtbps : int 145 130 130 120 120 140 140 120 172 150 ...
## $ chol : int 233 250 204 236 354 192 294 263 199 168 ...
## $ fbs : int 1 0 0 0 0 0 0 0 1 0 ...
## $ restecg : int 0 1 0 1 1 1 0 1 1 1 ...
## $ thalachh: int 150 187 172 178 163 148 153 173 162 174 ...
## $ exng : int 0 0 0 0 1 0 0 0 0 0 ...
## $ oldpeak : num 2.3 3.5 1.4 0.8 0.6 0.4 1.3 0 0.5 1.6 ...
## $ slp : int 0 0 2 2 2 1 1 2 2 2 ...
## $ caa : int 0 0 0 0 0 0 0 0 0 0 ...
## $ thall : int 1 2 2 2 2 1 2 3 3 2 ...
## $ output : int 1 1 1 1 1 1 1 1 1 1 ...
## - attr(*, ".internal.selfref")=<externalptr>

# Recode the integer-coded categorical columns as factors in a single
# by-reference update instead of one assignment per column.
cols_factor <- c("sex", "cp", "fbs", "restecg", "exng", "output", "caa")
datos[, (cols_factor) := lapply(.SD, as.factor), .SDcols = cols_factor]
### P2: Fit two multiple linear regression models to predict resting blood
### pressure (trtbps). Which one predicts better in-sample? (8 points)

# Model 1: trtbps explained by exng and sex only.
f01 <- trtbps ~ exng + sex
reg_1 <- lm(f01, data = datos)
summary(reg_1)
##
## Call:
## lm(formula = f01, data = datos)
##
## Residuals:
## Min 1Q Median 3Q Max
## -38.757 -11.875 -2.423 8.125 64.695
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 132.423 1.855 71.368 <2e-16 ***
## exng1 2.882 2.167 1.330 0.185
## sex1 -2.548 2.185 -1.166 0.244
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 17.52 on 300 degrees of freedom
## Multiple R-squared: 0.009064, Adjusted R-squared: 0.002458
## F-statistic: 1.372 on 2 and 300 DF, p-value: 0.2552

# In-sample fitted values: computed once, kept as a standalone vector and
# also stored as a column of datos.
pred1 <- predict(reg_1)
datos[, prediccion1 := pred1]
# Model 2: extends model 1 with thalachh, cp, restecg, chol and age.
f02 <- trtbps ~ exng + sex + thalachh + cp + restecg + chol + age
reg_2 <- lm(f02, data = datos)
summary(reg_2)
##
## Call:
## lm(formula = f02, data = datos)
##
## Residuals:
## Min 1Q Median 3Q Max
## -39.609 -10.499 -1.098 9.068 64.272
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 89.25841 12.54257 7.116 8.61e-12 ***
## exng1 2.97755 2.42231 1.229 0.2200
## sex1 -1.45528 2.17179 -0.670 0.5033
## thalachh 0.07400 0.05130 1.442 0.1502
## cp1 -0.93997 3.07654 -0.306 0.7602
## cp2 0.04790 2.54536 0.019 0.9850
## cp3 9.08036 3.92771 2.312 0.0215 *
## restecg1 -2.93021 2.00125 -1.464 0.1442
## restecg2 4.92172 8.58967 0.573 0.5671
## chol 0.01495 0.01977 0.756 0.4500
## age 0.52399 0.12074 4.340 1.97e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 16.72 on 292 degrees of freedom
## Multiple R-squared: 0.1208, Adjusted R-squared: 0.09072
## F-statistic: 4.013 on 10 and 292 DF, p-value: 3.631e-05

# In-sample fitted values: computed once, kept as a standalone vector and
# also stored as a column of datos.
pred2 <- predict(reg_2)
datos[, prediccion2 := pred2]
# In-sample error of each regression against the observed trtbps.
# TRUE is spelled out: the alias T is an ordinary variable that can be
# reassigned, which would silently change the meaning of na.rm.

# Model 1 (reg_1)
data.table(
  RMSE = RMSE(pred1, datos$trtbps, na.rm = TRUE),
  MAE = MAE(pred1, datos$trtbps, na.rm = TRUE)
)
## RMSE MAE
## 1: 17.42965 13.43322

# Model 2 (reg_2)
data.table(
  RMSE = RMSE(pred2, datos$trtbps, na.rm = TRUE),
  MAE = MAE(pred2, datos$trtbps, na.rm = TRUE)
)
## RMSE MAE
## 1: 16.41733 12.5876

# Per the recorded output, model 2 has the lower RMSE and MAE in-sample.
# 5-fold cross-validation of both regression specifications.
setupKCV <- trainControl(method = "cv", number = 5)

# Re-seed immediately before EACH train() call. With a single seed ahead of
# both calls the second model draws its folds from whatever RNG state the
# first call left behind, so the two models were scored on different
# partitions and their errors were not directly comparable.
set.seed(12345)
predkfolds1 <- train(
  trtbps ~ exng + sex,
  data = datos, method = "lm", trControl = setupKCV
)
set.seed(12345)
predkfolds2 <- train(
  trtbps ~ exng + sex + thalachh + cp + restecg + chol + age,
  data = datos, method = "lm", trControl = setupKCV
)
print(predkfolds1)
## Linear Regression
##
## 303 samples
## 2 predictor
##
## No pre-processing
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 242, 242, 242, 244, 242
## Resampling results:
##
## RMSE Rsquared MAE
## 17.50709 0.01668347 13.54008
##
## Tuning parameter 'intercept' was held constant at a value of TRUE
print(predkfolds2)
## Output not reproduced here: rerun to refresh it. The figures recorded
## before the second seed was added (RMSE 16.95372, Rsquared 0.08369827,
## MAE 13.16558; sample sizes 243, 241, 242, 243, 243) came from a
## different fold partition than the one used for predkfolds1.
# Scatter plot of thalachh against age on the full data set.
ggplot(data = datos, aes(x = age, y = thalachh)) +
  geom_point()

# Drop rows with thalachh above 200 or below 80, or age above 70, in one
# pass. The leading `!` is data.table's not-select: it keeps every row for
# which the exclusion condition is not TRUE, which is what the three chained
# negated filters did before.
data1 <- datos[!(thalachh > 200 | thalachh < 80 | age > 70)]

# Same scatter plot on the trimmed data.
ggplot(data = data1, aes(x = age, y = thalachh)) +
  geom_point()

# k-means with 5 centres on (age, thalachh). The random starts are seeded
# here so the clustering does not depend on how much randomness earlier
# sections happened to consume.
# NOTE(review): both variables enter on their raw scales - confirm that no
# standardisation is wanted before clustering.
datakmeans <- data1[, .(age, thalachh)]
set.seed(12345)
k1 <- kmeans(x = datakmeans, centers = 5, nstart = 25)
fviz_cluster(k1, data = datakmeans, geom = "point")
### P5: Fit two classification trees for the variable output. Check which
### model classifies better using cross-validation. Train the model on 80%
### of the sample and test on the remaining 20%. State which model is better
### and why. (12 points)

# Tree 1 on the full sample: output explained by sex and age.
set.seed(12345)
arbol1 <- rpart(output ~ sex + age, data = datos, method = "class")
rpart.plot(arbol1)

# Tree 2 on the full sample: adds exng to the predictors of tree 1.
set.seed(12345)
arbol2 <- rpart(output ~ sex + exng + age, data = datos, method = "class")
rpart.plot(arbol2)
# 80/20 train/test split on output; the seed makes the partition repeatable.
# FALSE is spelled out because the alias F is a reassignable variable.
set.seed(12345)
div <- createDataPartition(datos$output, times = 1, p = 0.8, list = FALSE)
train <- datos[div, ] # training set: every observation indexed by div
test <- datos[-div, ] # test set: the remaining observations

# Tree 1 (sex + age), fitted on the training set only.
arbol_1 <- rpart(output ~ sex + age, data = train, method = "class")
rpart.plot(arbol_1)

# Confusion matrix on the test set: rows are observed classes, columns are
# predicted classes.
prediccion_1 <- predict(arbol_1, newdata = test, type = "class")
matriz <- table(test$output, prediccion_1)
matriz
## prediccion_1
## 0 1
## 0 19 8
## 1 12 21

# Overall accuracy: correctly classified cases (the diagonal) over all cases.
precision <- sum(diag(matriz)) / sum(matriz)
precision
## [1] 0.6666667
# Rebuild the 80/20 split with the same seed and the same call, so this tree
# is scored on the same test rows as the sex + age tree.
# FALSE is spelled out because the alias F is a reassignable variable.
set.seed(12345)
div <- createDataPartition(datos$output, times = 1, p = 0.8, list = FALSE)
train <- datos[div, ] # training set: every observation indexed by div
test <- datos[-div, ] # test set: the remaining observations

# Tree 2 (sex + exng + age), fitted on the training set only.
arbol_2 <- rpart(output ~ sex + exng + age, data = train, method = "class")
rpart.plot(arbol_2)

# Confusion matrix on the test set: rows are observed classes, columns are
# predicted classes. Note that matriz and precision are overwritten here.
prediccion_2 <- predict(arbol_2, newdata = test, type = "class")
matriz <- table(test$output, prediccion_2)
matriz
## prediccion_2
## 0 1
## 0 21 6
## 1 9 24

# Overall accuracy: correctly classified cases (the diagonal) over all cases.
precision <- sum(diag(matriz)) / sum(matriz)
precision
## [1] 0.75