Hipotesis desbalanceados

library(tidyverse)

## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --

## v ggplot2 3.3.2     v purrr   0.3.4
## v tibble  3.0.4     v dplyr   1.0.2
## v tidyr   1.1.2     v stringr 1.4.0
## v readr   1.4.0     v forcats 0.5.0

## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

library(pander)
library(readxl)

##Hipotesis nula para modelos balanceados

\[ H_0: \mu_1 = \mu_2 =.... =\mu_n \\\mu_a = H_0 \text {es falsa}\] modelo de diseño \[ Y_{ij}=\\m\mu_i + \epsilon_{ij}\\i = 1...9\\j=1...r\]

P-valor del estudio

set.seed(123)
malato = rnorm(120, 10, 1.2)
vino = gl(3,40,120,c('v1','v2','v3'))
datos = data.frame(Malato = malato, Vino = vino)

mod1 = aov(datos$Malato~datos$Vino)
summary(mod1)

##              Df Sum Sq Mean Sq F value Pr(>F)
## datos$Vino    2   0.08  0.0413   0.035  0.965
## Residuals   117 137.01  1.1710

EJ1 <- read_excel("C:\\Users\\yarvis\\Downloads\\EJ1 (1).xlsx")

dfe1 = read_excel("C:\\Users\\yarvis\\Downloads\\EJ1 (1).xlsx",    sheet = "isolation")

dfe1=data.frame(EJ1)
df1=dfe1[,-1]
summary(df1)

##      INC             PROF.RAIZ            ALTURA         
##  Length:50          Length:50          Length:50         
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character

df1$ALTURA=as.numeric(df1$ALTURA)
df1$PROF.RAIZ = as.numeric(df1$PROF.RAIZ)
df1$INC=as.numeric(df1$INC)
model1 = glm(df1$INC ~ df1$PROF.RAIZ + df1$ALTURA, family = binomial)
summary(model1)

## 
## Call:
## glm(formula = df1$INC ~ df1$PROF.RAIZ + df1$ALTURA, family = binomial)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.8189  -0.3089   0.0490   0.3635   2.1192  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)   
## (Intercept)     6.6417     2.9218   2.273  0.02302 * 
## df1$PROF.RAIZ   0.5807     0.2478   2.344  0.01909 * 
## df1$ALTURA     -1.3719     0.4769  -2.877  0.00401 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 68.029  on 49  degrees of freedom
## Residual deviance: 28.402  on 47  degrees of freedom
## AIC: 34.402
## 
## Number of Fisher Scoring iterations: 6

model2 = glm(df1$INC ~ df1$PROF.RAIZ + df1$ALTURA, binomial)
summary(model2)

## 
## Call:
## glm(formula = df1$INC ~ df1$PROF.RAIZ + df1$ALTURA, family = binomial)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.8189  -0.3089   0.0490   0.3635   2.1192  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)   
## (Intercept)     6.6417     2.9218   2.273  0.02302 * 
## df1$PROF.RAIZ   0.5807     0.2478   2.344  0.01909 * 
## df1$ALTURA     -1.3719     0.4769  -2.877  0.00401 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 68.029  on 49  degrees of freedom
## Residual deviance: 28.402  on 47  degrees of freedom
## AIC: 34.402
## 
## Number of Fisher Scoring iterations: 6

Comparacion de modelos

\[H_0: El modelo mas pequeño es el mejor \]

anova(model1, model2, test= 'Chi')

## Analysis of Deviance Table
## 
## Model 1: df1$INC ~ df1$PROF.RAIZ + df1$ALTURA
## Model 2: df1$INC ~ df1$PROF.RAIZ + df1$ALTURA
##   Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1        47     28.402                     
## 2        47     28.402  0        0

Modelo \[ln(\frac{p_i}{1-p_i}= \beta_0+\beta_1X_1+\beta_2X_2+\epsilon_i\\ln(\frac{p_i}{1-p_i}=6.64 +0.58 PR-1.37A\\ln(\frac{p_i}{1-p_i}=exp(6.64 +0.58 PR-1.37A)\\ln(\frac{p_i}{1-p_i}=exp(6.64) *exp(0.58 PR)*exp(-1.37A)\] Probabilidades estimadas

num = 765.1*exp(0.58*df1$PROF.RAIZ) * exp(-1.37*df1$ALTURA)
den = 1 + num
pi=num/den
plot(df1$INC, pi)

pi_ = ifelse(pi>0.5,1, 0)
table(df1$INC, pi_)

##    pi_
##      0  1
##   0 17  4
##   1  3 26

(porcentaje_clasi = 100 * (17+26)/(17+26+4+3))

## [1] 86

pi_ = ifelse(pi>0.5, 1, 0)
tb = table(df1$INC, pi_) # matriz de confusion

porcentaje_clasi = 100 * sum(diag(tb))/sum(tb)
porcentaje_clasi

## [1] 86

se clasifico el 86% de los datos

pi_2 = ifelse(pi>0.55,1, 0)
tb = table(df1$INC, pi_2) # matriz de confución

porcentaje_clasi2 = 100 * sum(diag(tb))/sum(tb)
porcentaje_clasi2

## [1] 84

El modelon no acerto en 7 datos

(porcentaje_clasi2 = 100 * (17+26)/(17+26+4+3))

## [1] 86

porcentaje = c()
cont =1
matrix = list()
for(i in seq(0.01, 0.99, 0.01)){
  pi_ = ifelse(pi>i,1,0)
  tb = table(dfe1$INC, pi_)
  pr = 100*sum(diag(tb))/sum(tb)
  porcentaje[cont] = pr
  cont = cont +1
  
}

plot(seq(0.01, 0.99, 0.01), porcentaje, pch = 19 , cex = 0.8,ylab = 'Porcentaje de clasificación', xlab = 'Valor punto de corte')
abline(v = c(0.44, 0.5), col = c('red', 'blue'))

punto de mayor clasificacion (pi)

seq(0.01,0.99,0.01)[which.max(porcentaje)]

## [1] 0.43

punto de corte para mayor clasificación : 0.43

Hipotesis desbalanceados

Ruth Jeanneth Perez Vallejo

30/11/2020