Analisis de variables cualitativas (prueba de hipotesis ejercicios)

Author

Angel YGG

Importar bases

library(MASS)
data("birthwt")
library(readr)
blood_storage <- read_csv("C:/Users/David/Downloads/blood_storage.csv")

New names:
Rows: 316 Columns: 21
── Column specification
──────────────────────────────────────────────────────── Delimiter: "," dbl
(21): ...1, RBC.Age.Group, Median.RBC.Age, Age, AA, FamHx, PVol, TVol, T...
ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
Specify the column types or set `show_col_types = FALSE` to quiet this message.
• `` -> `...1`

library("RColorBrewer")

Resolución ejercicio 12.1

#Tabla de contingencia de variables a evaluar
tabx1 <- table(birthwt$race, birthwt$low)
#Grafico con proporciones
barplot(prop.table(tabx1,2), 
        legend= rownames(tabx1), 
        beside= T, 
        ylab = "Proporción", 
        names= c("Bajo peso","Peso estandar"),
        col=brewer.pal(n = 3, name = "Accent"))

#Test de chisquare
chisq.test(tabx1)


    Pearson's Chi-squared test

data:  tabx1
X-squared = 5.0048, df = 2, p-value = 0.08189

tabx2 <-chisq.test(tabx1)
tabx2$observed

tabx2$expected

   
           0         1
  1 66.03175 29.968254
  2 17.88360  8.116402
  3 46.08466 20.915344

Resolución ejercicio 12.2

tabx1.1 <- table(birthwt$ht, birthwt$low)
#Grafico con proporciones
barplot(prop.table(tabx1.1,2), 
        legend= rownames(tabx1.1), 
        beside= T, 
        ylab = "Proporción", 
        names= c("Bajo peso","Peso estandar"),
        col=brewer.pal(n = 3, name = "Accent"))

#Test de F de Fisher
fisher.test(tabx1.1, conf.int = T, conf.level = 0.95)


    Fisher's Exact Test for Count Data

data:  tabx1.1
p-value = 0.05161
alternative hypothesis: true odds ratio is not equal to 1
95 percent confidence interval:
  0.8679484 13.9894703
sample estimates:
odds ratio 
  3.340866

Resolución de ejercicio 12.3

#Tabla de contingencia de variables a evaluar
tabx1.3 <- table(birthwt$race, birthwt$smoke)
#Grafico con proporciones
barplot(prop.table(tabx1.3,2), 
        legend= rownames(tabx1.3), 
        beside= T, 
        ylab = "Proporción", 
        names= c("No fuma","Fumador"),
        col=brewer.pal(n = 3, name = "Accent"))

#Test de chisquare
chisq.test(tabx1.3)


    Pearson's Chi-squared test

data:  tabx1.3
X-squared = 21.779, df = 2, p-value = 1.865e-05

tabx2.3 <-chisq.test(tabx1.3)
tabx2.3$observed

tabx2.3$expected

   
           0        1
  1 58.41270 37.58730
  2 15.82011 10.17989
  3 40.76720 26.23280

Resolución ejercicio 12.4

#Tabla de contingencia de variables a evaluar
tabx1.4 <- table(birthwt$race, birthwt$low)
#Grafico con proporciones
barplot(prop.table(tabx1.4,2), 
        legend= rownames(tabx1.4), 
        beside= T, 
        ylab = "Proporción", 
        names= c("Bajo peso","Peso estandar"),
        col=brewer.pal(n = 3, name = "Accent"))

#Test de F de Fisher
fisher.test(tabx1.4, conf.int = T, conf.level = 0.95)


    Fisher's Exact Test for Count Data

data:  tabx1.4
p-value = 0.07889
alternative hypothesis: two.sided

#conjunto de variables 2

tabx1.1 <- table(birthwt$ht, birthwt$low)
#Grafico con proporciones
barplot(prop.table(tabx1.1,2), 
        legend= rownames(tabx1.1), 
        beside= T, 
        ylab = "Proporción", 
        names= c("Bajo peso","Peso estandar"),
        col=brewer.pal(n = 3, name = "Accent"))

#Test de F de Fisher
fisher.test(tabx1.1, conf.int = T, conf.level = 0.95)


    Fisher's Exact Test for Count Data

data:  tabx1.1
p-value = 0.05161
alternative hypothesis: true odds ratio is not equal to 1
95 percent confidence interval:
  0.8679484 13.9894703
sample estimates:
odds ratio 
  3.340866

#conjunto de variables 3
#Tabla de contingencia de variables a evaluar
tabx1.5 <- table(birthwt$race, birthwt$smoke)
#Grafico con proporciones
barplot(prop.table(tabx1.5,2), 
        legend= rownames(tabx1.5), 
        beside= T, 
        ylab = "Proporción", 
        names= c("No fuma","Fumador"),
        col=brewer.pal(n = 3, name = "Accent"))

#Test de F de Fisher
fisher.test(tabx1.5, conf.int = T, conf.level = 0.95)


    Fisher's Exact Test for Count Data

data:  tabx1.5
p-value = 9.799e-06
alternative hypothesis: two.sided

Resolución ejercicio 12.5

mosaicplot(prop.table(table(blood_storage$AA,
                            blood_storage$Recurrence)),
           main="Relación del grupo etnico y remision del cancer de prostata", 
           col=brewer.pal(n = 3, name = "Accent"))

#Una vez observado el grafico de mosaico se puede observar una aparente relacion

#Prueba de diferencias entre varaibles cualitativas
tabx1.6 <- table(blood_storage$AA, blood_storage$Recurrence)
#Grafico con proporciones
barplot(prop.table(tabx1.6,2), 
        legend= rownames(tabx1.6), 
        beside= T, 
        ylab = "Proporción", 
        names= c("Otra raza","Afroamericano"), 
        col = c(2:3))

chisq.test(tabx1.6)


    Pearson's Chi-squared test with Yates' continuity correction

data:  tabx1.6
X-squared = 2.6134, df = 1, p-value = 0.106

tabx2.6 <-chisq.test(tabx1.6)
tabx2$observed

tabx2$expected

   
           0         1
  1 66.03175 29.968254
  2 17.88360  8.116402
  3 46.08466 20.915344

#Variables recurrencia y edad de muestra de sangre

tabx1.7 <- table(blood_storage$RBC.Age.Group, blood_storage$Recurrence)
#Grafico con proporciones
barplot(prop.table(tabx1.7,2), 
        legend= rownames(tabx1.7), 
        beside= T, 
        ylab = "Proporción", 
        names= c("Sin remisión", "Remision"), 
        col=brewer.pal(n = 3, name = "Accent"))

#prueba se Shi cuadrada
chisq.test(tabx1.7)


    Pearson's Chi-squared test

data:  tabx1.7
X-squared = 0.082401, df = 2, p-value = 0.9596

tabx2.7 <-chisq.test(tabx1.7)
tabx2.7$observed

tabx2.7$expected

   
           0        1
  1 87.88608 18.11392
  2 85.39873 17.60127
  3 88.71519 18.28481

#Variables tamaño del tumor y etnia
tabx1.8 <- table(blood_storage$AA, blood_storage$TVol)
#Grafico con proporciones
barplot(prop.table(tabx1.8,2), 
        legend= rownames(tabx1.8), 
        beside= T, 
        ylab = "Proporción", 
        names= c("Bajo", "Medio","Alto"), 
        col= c(2:3))

chisq.test(tabx1.8)


    Pearson's Chi-squared test

data:  tabx1.8
X-squared = 1.3273, df = 2, p-value = 0.515

tabx2.8 <-chisq.test(tabx1.8)
tabx2.8$observed

   
      1   2   3
  0  56 124  77
  1   8  29  16

tabx2.8$expected

   
           1         2    3
  0 53.05806 126.84194 77.1
  1 10.94194  26.15806 15.9

Resolucion ejercicio 12.6

Recodiciacion de variables “Melanoma”

data("Melanoma")
#recodificacion de variables
Melanoma$status <- factor(Melanoma$status, 
                                      
                          levels = c(1,2,3),
                                      
                          labels = c("Bajo", "Medio","Avanzado"))
                         
Melanoma$sex <- factor(Melanoma$sex, 
                         levels = c(0,1),
                         labels = c("mujer", "hombre"))


Melanoma$ulcer <- factor(Melanoma$ulcer, 
                          levels = c(0,1),
                          labels = c("sin ulcera", "con ulcera"))

#Realizar un corte hasta los 50 años en variable age

Melanoma$edad.1 <- cut(Melanoma$age, 
                            breaks = c(0, 15, 25, 35, 50),
                            labels = c("0-15", "15-25", 
                                       "25-35", "35-50"),
                            right = F, na.rm= TRUE) 

Melanoma$edad.1 <- factor(Melanoma$edad.1)

Prueba de relación entre variables

#Influencia de variable age vs status?
tabx1.9 <- table(Melanoma$status, Melanoma$edad.1)
#Grafico con proporciones
barplot(prop.table(tabx1.9,2), 
        legend= rownames(tabx1.9), 
        beside= T, 
        ylab = "Proporción", 
        names= c("0-15", "15-25","25-35","35-50"), 
        col=brewer.pal(n = 3, name = "Accent"))

chisq.test(tabx1.9)

Warning in chisq.test(tabx1.9): Chi-squared approximation may be incorrect


    Pearson's Chi-squared test

data:  tabx1.9
X-squared = 1.4145, df = 6, p-value = 0.965

tabx2.9 <-chisq.test(tabx1.9)

Warning in chisq.test(tabx1.9): Chi-squared approximation may be incorrect

tabx2.9$observed

          
           0-15 15-25 25-35 35-50
  Bajo        1     2     5    12
  Medio       2     7    14    39
  Avanzado    0     0     0     2

tabx2.9$expected

          
                 0-15     15-25     25-35     35-50
  Bajo     0.71428571 2.1428571  4.523810 12.619048
  Medio    2.21428571 6.6428571 14.023810 39.119048
  Avanzado 0.07142857 0.2142857  0.452381  1.261905

#hombres o mujeres con mayor mortalidad

tabx1.10 <- table(Melanoma$status, Melanoma$sex)
#Grafico con proporciones
barplot(prop.table(tabx1.10,2), 
        legend= rownames(tabx1.10), 
        beside= T, 
        ylab = "Proporción", 
        names= c("Hombre", "Mujer"), 
        col=brewer.pal(n = 3, name = "Accent"))

chisq.test(tabx1.10)


    Pearson's Chi-squared test

data:  tabx1.10
X-squared = 6.793, df = 2, p-value = 0.03349

tabx2.10 <-chisq.test(tabx1.10)
tabx2.10$observed

          
           mujer hombre
  Bajo        28     29
  Medio       91     43
  Avanzado     7      7

tabx2.10$expected

          
               mujer    hombre
  Bajo     35.034146 21.965854
  Medio    82.360976 51.639024
  Avanzado  8.604878  5.395122

#Presencia o ausencia de ulceras vs status

tabx1.11 <- table(Melanoma$status, Melanoma$ulcer)
#Grafico con proporciones
barplot(prop.table(tabx1.11,2), 
        legend= rownames(tabx1.11), 
        beside= T, 
        ylab = "Proporción", 
        names= c("sin ulceras", "Con ulceras"), 
        col=brewer.pal(n = 3, name = "Accent"))

chisq.test(tabx1.11)


    Pearson's Chi-squared test

data:  tabx1.11
X-squared = 26.974, df = 2, p-value = 1.389e-06

tabx2.11 <-chisq.test(tabx1.11)
tabx2.11$observed

          
           sin ulcera con ulcera
  Bajo             16         41
  Medio            92         42
  Avanzado          7          7

tabx2.11$expected

          
           sin ulcera con ulcera
  Bajo      31.975610  25.024390
  Medio     75.170732  58.829268
  Avanzado   7.853659   6.146341

Resolucion ejercicio 12.7

Importar dataframe

library(readr)
Fertility <- read_csv("C:/Users/David/Downloads/Fertility.csv")

New names:
Rows: 254654 Columns: 9
── Column specification
──────────────────────────────────────────────────────── Delimiter: "," chr
(6): morekids, gender1, gender2, afam, hispanic, other dbl (3): ...1, age, work
ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
Specify the column types or set `show_col_types = FALSE` to quiet this message.
• `` -> `...1`

Relación variable de fertilidad con otras cualitativas

#cambio de estructura de la variable
Fertility$morekids <- factor(Fertility$morekids)
Fertility$afam <- factor(Fertility$afam)
Fertility$hispanic <- factor(Fertility$hispanic)
Fertility$other <- factor(Fertility$other)

#Fertilidad vs grupo etnico
#afroamericano
tabx1.12 <- table(Fertility$morekids, Fertility$afam)
#Grafico con proporciones
barplot(prop.table(tabx1.12,2), 
        legend= rownames(tabx1.12), 
        beside= T, 
        ylab = "Proporción", 
        names= c("Otra raza", "Afroamericano"), 
        col= c(2:3))

chisq.test(tabx1.12)


    Pearson's Chi-squared test with Yates' continuity correction

data:  tabx1.12
X-squared = 352.23, df = 1, p-value < 2.2e-16

tabx2.12 <-chisq.test(tabx1.12)
tabx2.12$observed

     
          no    yes
  no  150611   7131
  yes  90887   6025

tabx2.12$expected

     
             no      yes
  no  149592.69 8149.307
  yes  91905.31 5006.693

#origen hispano
tabx1.13 <- table(Fertility$morekids, Fertility$hispanic)
#Grafico con proporciones
barplot(prop.table(tabx1.13,2), 
        legend= rownames(tabx1.13), 
        beside= T, 
        ylab = "Proporción", 
        names= c("Otra raza", "Hispano"), 
        col= c(2:3))

chisq.test(tabx1.13)


    Pearson's Chi-squared test with Yates' continuity correction

data:  tabx1.13
X-squared = 1536.1, df = 1, p-value < 2.2e-16

tabx2.13 <-chisq.test(tabx1.13)
tabx2.13$observed

     
          no    yes
  no  148554   9188
  yes  87203   9709

tabx2.13$expected

     
             no       yes
  no  146036.51 11705.493
  yes  89720.49  7191.507