library(faux)
##
## ************
## Welcome to faux. For support and examples visit:
## https://debruine.github.io/faux/
## - Get and set global package options with: faux_options()
## ************
library(readxl)
library(rgl)
Load the data
datos <- read_excel("D:/M_multivariados/discriminantes_051923.xlsx")
head(datos)
datos$g = as.factor(datos$g)
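A quick structure check (a minimal sketch, assuming the columns DE, DL, BIO and the group factor g used below):
str(datos)       # column types after converting g to a factor
table(datos$g)   # group sizes: enferma vs. sana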
View the data in several dimensions
plot3d(datos[,-4], type='s',
col=as.numeric(datos$g))
Pairs plot
pairs(datos[,-4], pch = 16, col = datos$g)
The density shows the distribution pattern of each variable by group
library(ggplot2)
ggplot(datos)+
aes(DE, fill = g)+
geom_density(alpha=0.5)
Second variable
ggplot(datos)+
aes(DL, fill = g)+
geom_density(alpha=0.5)
Third variable
ggplot(datos)+
aes(BIO, fill = g)+
geom_density(alpha=0.5)
# Bivariate density
ggplot(datos)+
aes(DE, DL, color = g)+
geom_density_2d()
The overlapping region is visible, which makes it harder to classify 'sana' versus 'enferma'.
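To see that overlap more directly, the raw observations can be overlaid on the contours (a small sketch using the same variables):
ggplot(datos)+
aes(DE, DL, color = g)+
geom_point(alpha = 0.6)+
geom_density_2d()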
ggplot(datos)+
aes(DL, BIO, color = g)+
geom_density_2d()
Third plot
ggplot(datos)+
aes(DE, BIO, color = g)+
geom_density_2d()
# Create the training data
set.seed(123)
muest = sample(x = c(T, F), size = nrow(datos),
replace = T, prob = c(0.8,0.2))
X_train = datos[muest, -4]
y_train = datos[muest, 4]
X_test = datos[!muest, -4]
y_test = datos[!muest, 4]
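A quick sanity check on the split; the class proportions should stay roughly balanced (a sketch using the objects just created):
table(y_train$g)          # group counts in the training set
table(y_test$g)           # group counts in the test set
prop.table(table(muest))  # share of rows assigned to training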
# Building the discriminant model with the training data
#plot3d(X_train, type = 's', col = as.numeric(y_train$g))
# Create costs and priors
# Costs: C(2|1) = C(S|E), C(1|2) = C(E|S)
# Cost ratio: C(1|2)/C(2|1) = C(E|S)/C(S|E) = 1/3
C12 = 1
C21 = 3
# Priors (class proportions)
cont = as.vector(table(datos$g))
P1 = cont[1]/sum(cont)
P2 = cont[2]/sum(cont)
# Compute the mean vector of each group
mu1 = colMeans(X_train[y_train$g=='enferma',])
mu2 = colMeans(X_train[y_train$g=='sana',])
Difference and sum of the group mean vectors
mu_d= mu1 - mu2
mu_s = mu1 + mu2
# Variance-covariance matrix of each group
S1 = cov(X_train[y_train$g=='enferma',])
S2 = cov(X_train[y_train$g=='sana',])
S1
## DE DL BIO
## DE 0.07508443 0.07905612 1.865136
## DL 0.07905612 0.13726810 2.954543
## BIO 1.86513604 2.95454304 93.880361
S2
## DE DL BIO
## DE 0.08014479 0.09460662 1.940774
## DL 0.09460662 0.17398740 3.173743
## BIO 1.94077412 3.17374300 87.365875
# Pooled covariance matrix Sp
n1 = nrow(X_train[y_train$g=='enferma',])
n2 = nrow(X_train[y_train$g=='sana',])
Sp = ((n1-1)/((n1-1)+(n2-1)))*S1 + ((n2-1)/((n1-1)+(n2-1)))*S2
Spi = solve(Sp)
Sp
## DE DL BIO
## DE 0.07881644 0.09052461 1.920919
## DL 0.09052461 0.16434859 3.116203
## BIO 1.92091912 3.11620301 89.075928
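The chunks that follow implement the linear discriminant rule with unequal costs and priors: classify an observation x as 'enferma' when t(mu_d) %*% Spi %*% x - (1/2) * t(mu_d) %*% Spi %*% mu_s >= log((C12/C21) * (P2/P1)). A minimal sketch of the two pieces of that rule, using the objects computed above:
a = Spi %*% mu_d                          # discriminant coefficient vector Sp^-1 (mu1 - mu2)
threshold = log((C12 / C21) * (P2 / P1))  # right-hand side of the rule
a
threshold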
# Discriminant model: classify the first test observation
X0 = unlist(X_test[1,])
model_izq = t(mu_d) %*% Spi %*% X0 - (1 / 2) * t(mu_d) %*% Spi %*% mu_s
model_der = log((C12/C21)*(P2/P1))
ifelse(model_izq >= model_der,
'enferma', 'sana')
## [,1]
## [1,] "enferma"
y_test[1,]
Visual check of the classified observation
data_1clas = rbind(X_train, X0)
g_1clas = c(y_train$g, 3)
plot3d(data_1clas, type='s',
col=g_1clas)
# Prediction on the full test set
part1 = t(mu_d) %*% Spi %*% t(X_test)
part2 = (1 / 2) * t(mu_d) %*% Spi %*% mu_s
mod_izq = part1 - part2[1,1]
y_pred = sapply(mod_izq, function(x0_i){
ifelse(x0_i >= model_der,
'enferma', 'sana')
})
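As a cross-check on these manual predictions (not part of the original analysis), MASS::lda can be fit on the same split with the empirical priors; note that lda() does not take the cost ratio, so it corresponds to the equal-cost rule:
library(MASS)
fit_lda = lda(x = X_train, grouping = y_train$g, prior = c(P1, P2))
pred_lda = predict(fit_lda, newdata = X_test)$class
table(y_test$g, pred_lda)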
Confusion matrix
tbl = table(y_test$g, y_pred)
tbl
## y_pred
## enferma sana
## enferma 7 1
## sana 1 9
sum(diag(tbl)) / sum(tbl)
## [1] 0.8888889
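From the same confusion matrix, the per-class rates (a small sketch; the rows of tbl are the true classes):
sens = tbl['enferma', 'enferma'] / sum(tbl['enferma', ])  # true 'enferma' correctly classified
spec = tbl['sana', 'sana'] / sum(tbl['sana', ])            # true 'sana' correctly classified
c(sensitivity = sens, specificity = spec)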
Export the data from RStudio to Excel
#install.packages("openxlsx")
library(openxlsx)
## Warning: package 'openxlsx' was built under R version 4.3.1
# Method 1 to standardize the variables: z-score
datos_zscore = scale(datos[,-4])
as.data.frame(datos_zscore)
Try to export it properly
write.xlsx(datos_zscore, "D:/M_multivariados/varian_2.xlsx" )
## Warning in file.create(to[okay]): cannot create file
## 'D:/M_multivariados/varian_2.xlsx', reason 'Permission denied'
2. Min-max (this one did work)
datos_minimax = apply(
datos[,-4], 2, function(ci){
(ci-min(ci))/(max(ci)-min(ci))
})
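A quick check that every rescaled column lies in [0, 1]:
apply(datos_minimax, 2, range)  # min and max of each min-max scaled column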
##### Re-run the analysis after standardizing the variables
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
datos_st = apply(
datos[,-4], 2, function(ci){
(ci-min(ci))/(max(ci)-min(ci))
})
datos2 = cbind(datos_st,datos$g)
datos2 = as.data.frame(datos2)
datos2 <- datos2 %>%
mutate(V4 = recode(V4, `1` = "enferma", `2` = "sana"))
datos2$V4 = as.factor(datos2$V4)
as.data.frame(datos2)
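Because cbind() stores the factor as its integer codes, it is worth confirming that the recode maps those codes back to the original labels (a sketch):
table(original = datos$g, recoded = datos2$V4)  # should be concentrated on the diagonal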
# Create the training data
set.seed(123)
muest2 = sample(x = c(T, F), size = nrow(datos2),
replace = T, prob = c(0.8,0.2))
muest2
## [1] TRUE TRUE TRUE FALSE FALSE TRUE TRUE FALSE TRUE TRUE FALSE TRUE
## [13] TRUE TRUE TRUE FALSE TRUE TRUE TRUE FALSE FALSE TRUE TRUE FALSE
## [25] TRUE TRUE TRUE TRUE TRUE TRUE FALSE FALSE TRUE TRUE TRUE TRUE
## [37] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [49] TRUE FALSE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE FALSE TRUE
## [61] TRUE TRUE TRUE TRUE FALSE TRUE FALSE FALSE TRUE TRUE TRUE TRUE
## [73] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [85] TRUE TRUE FALSE FALSE FALSE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [97] TRUE TRUE TRUE TRUE
Second part
#####
set.seed(123)
muest = sample(x = c(T, F), size = nrow(datos2),
replace = T, prob = c(0.8,0.2))
X_train2 = datos2[muest2, -4]
#y_train = datos2[muest2, 4]
X_test2 = datos2[!muest2, -4]
#y_test = datos2[!muest2, 4]
# Building the discriminant model (standardized data)
C12a = 1
C21a = 3
# Priors (class proportions)
cont2 = as.vector(table(datos2$V4))
P1a = cont2[1]/sum(cont2)
P2a = cont2[2]/sum(cont2)
mu12 = colMeans(X_train2[y_train$g == 'enferma',])
mu22 = colMeans(X_train2[y_train$g =='sana',])
mu_d2 = mu12 - mu22
mu_s2 = mu12 + mu22
# Variance-covariance matrix of each group
S12 = cov(X_train2[y_train$g == 'enferma',])
S22 = cov(X_train2[y_train$g=='sana',])
n12 = nrow(X_train2[y_train$g == 'enferma',])
n22 = nrow(X_train2[y_train$g=='sana',])
Sp2 = ((n12-1)/((n12-1)+(n22-1)))*S12 + ((n22-1)/((n12-1)+(n22-1)))*S22
Spi2 = solve(Sp2)
Spi2
## DE DL BIO
## DE 76.90188 -59.17864 -22.16143
## DL -59.17864 173.20108 -86.33720
## BIO -22.16143 -86.33720 132.62245
Checking the first test observation
X02 = unlist(X_test2[1,])
model_izq2 = t(mu_d2) %*% Spi2 %*% X02 - (1 / 2) * t(mu_d2) %*% Spi2 %*% mu_s2
model_der2 = log((C12a / C21a) * (P2a / P1a))
ifelse(model_izq2 >= model_der2,
'enferma', 'sana')
## [,1]
## [1,] "enferma"
y_test[1,]
Model review
data_1clas12 = rbind(X_train2, X02)
g_1clas2 = c(y_train$g, 3)
plot3d(data_1clas12, type='s',
col=g_1clas2)
Prediction
part12 = t(mu_d2) %*% Spi2 %*% t(X_test2)
part22 = (1 / 2) * t(mu_d2) %*% Spi2 %*% mu_s2
mod_izq2 = part12 - part22[1,1]
y_pred3 = sapply(mod_izq2, function(x0_i){
ifelse(x0_i >= model_der2,
'enferma', 'sana')
})
Confusion matrix
tbl2 = table(y_test$g, y_pred3)
tbl2
## y_pred3
## enferma sana
## enferma 7 1
## sana 1 9
sum(diag(tbl2)) / sum(tbl2)
## [1] 0.8888889
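Finally, a quick comparison of the two sets of predictions; if the min-max rescaling only changes the scale of the discriminant scores, the classifications should agree (a sketch using the objects above):
table(raw = y_pred, rescaled = y_pred3)
all(y_pred == y_pred3)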