1. Revisar los datos

En la tabla de datos disponible aquí se encuentran datos de las estaturas de un grupo anterior de la especialización en estadística aplicada

library(readxl)
library(DT)
library(tigerstats)

2. Estimación de parámetros

Estime la media y la desviación estándar de cada género, así como la proporción de hombres y mujeres del curso.

# Load the height/gender workbook and show it as an interactive table.
BaseEstaturaGenero <- read_excel("BaseEstaturaGenero.xlsx")
DT::datatable(BaseEstaturaGenero)
# Genero is read as a numeric column (<dbl> in the printouts), so it is
# compared against numbers instead of relying on implicit coercion to the
# strings "1" / "0". Rows with Genero equal to 1 are stored in Fem.
Fem <- BaseEstaturaGenero %>%
  filter(Genero == 1)
Fem
## # A tibble: 6 × 2
##   Estatura Genero
##      <dbl>  <dbl>
## 1      157      1
## 2      160      1
## 3      166      1
## 4      155      1
## 5      167      1
## 6      165      1
# Rows with Genero equal to 0 are stored in Mas.
Mas <- BaseEstaturaGenero %>%
  filter(Genero == 0)
Mas
## # A tibble: 16 × 2
##    Estatura Genero
##       <dbl>  <dbl>
##  1      180      0
##  2      180      0
##  3      170      0
##  4      177      0
##  5      176      0
##  6      179      0
##  7      168      0
##  8      172      0
##  9      177      0
## 10      176      0
## 11      167      0
## 12      174      0
## 13      165      0
## 14      178      0
## 15      173      0
## 16      172      0
# Sample mean of Estatura in each subset.
x_barraF <- mean(Fem$Estatura)
x_barraF
## [1] 161.6667
Y_barraM <- mean(Mas$Estatura)
Y_barraM
## [1] 174
# Sample standard deviation of Estatura in each subset.
desvx <- sd(Fem$Estatura)
desvx
## [1] 5.046451
desvy <- sd(Mas$Estatura)
desvy
## [1] 4.690416
# Group sizes. Despite the "Proporcion_" prefix these hold counts (n); the
# two-sample test further down uses them as sample sizes, so both the names
# and the values are kept unchanged.
Proporcion_M <- length(Fem$Estatura)
Proporcion_M
## [1] 6
Proporcion_H <- length(Mas$Estatura)
Proporcion_H
## [1] 16
# Actual share of each group in the course, which is what the exercise asks
# for: group size divided by the total number of observations.
n_total <- Proporcion_M + Proporcion_H
prop_M <- Proporcion_M / n_total
prop_M
## [1] 0.2727273
prop_H <- Proporcion_H / n_total
prop_H
## [1] 0.7272727

3. Hipótesis

Escriba la hipótesis nula y alternativa de la estatura de cada género con referencia al artículo del Tiempo.

\[ H_0: \mu = 172 \\ H_a: \mu \neq 172 \]

library(tigerstats)
# One-sample t test of the mean Estatura in Mas against mu = 172, with the
# accompanying graph. The two-sided alternative is spelled out in full rather
# than relying on the abbreviation "t" being partially matched.
ttestGC(~Estatura, data = Mas, mu = 172, alternative = "two.sided",
        graph = TRUE)
## 
## 
## Inferential Procedures for One Mean mu:
## 
## 
## Descriptive Results:
## 
## variable  mean     sd       n          
## Estatura  174.000  4.690    16         
## 
## 
## Inferential Results:
## 
## Estimate of mu:   174 
## SE(x.bar):    1.173 
## 
## 95% Confidence Interval for mu:
## 
##           lower.bound         upper.bound          
##           171.500654          176.499346           
## 
## Test of Significance:
## 
##  H_0:  mu = 172 
##  H_a:  mu != 172 
## 
##  Test Statistic:     t = 1.706 
##  Degrees of Freedom:   15 
##  P-value:        P = 0.1087

El valor P (prueba bilateral) es \[P(|t| > 1.706) = 0.1087\]

\[ 0.1087 > 0.05\]

EXPLICACIÓN: No hay evidencia estadística suficiente para rechazar \(H_0\).

\[ H_0: \mu = 160 \\ H_a: \mu \neq 160 \]

# One-sample t test of the mean Estatura in Fem against mu = 160, with the
# accompanying graph. The two-sided alternative is spelled out in full rather
# than relying on the abbreviation "t" being partially matched.
ttestGC(~Estatura, data = Fem, mu = 160, alternative = "two.sided",
        graph = TRUE)
## 
## 
## Inferential Procedures for One Mean mu:
## 
## 
## Descriptive Results:
## 
## variable  mean     sd       n          
## Estatura  161.667  5.046    6          
## 
## 
## Inferential Results:
## 
## Estimate of mu:   161.7 
## SE(x.bar):    2.06 
## 
## 95% Confidence Interval for mu:
## 
##           lower.bound         upper.bound          
##           156.370741          166.962592           
## 
## Test of Significance:
## 
##  H_0:  mu = 160 
##  H_a:  mu != 160 
## 
##  Test Statistic:     t = 0.809 
##  Degrees of Freedom:   5 
##  P-value:        P = 0.4553

El valor P (prueba bilateral) es \[P(|t| > 0.809) = 0.4553\]

\[ 0.4553 > 0.05\]

EXPLICACIÓN: No hay evidencia estadística suficiente para rechazar \(H_0\).

Realizar los respectivos contrastes y concluir

4. Hipótesis diferencia de estatura entre géneros

Ahora ponga a prueba la hipótesis de diferencia de altura de ambos géneros en el curso, para ver si la altura media de las mujeres es igual a la altura media de los hombres.

\[ H_0: \mu_1 - \mu_2 = 12 \\ H_a: \mu_1 - \mu_2 > 12 \]

Concluya y escriba las consecuencias de la decisión

\[T_{Est} = \frac{\overline{X}-\overline{Y}-(\mu_1-\mu_2)}{S_p\sqrt{\frac{1}{n_1}+\frac{1}{n_2}}} \sim t_{(n_1+n_2-2)}\]

# Pooled standard deviation: square root of the degrees-of-freedom-weighted
# average of the two sample VARIANCES (averaging the standard deviations
# themselves, as before, is not the pooled estimator).
# Proporcion_H / Proporcion_M hold the group sizes; desvy / desvx the
# corresponding sample standard deviations.
Sp <- sqrt(
  ((Proporcion_H - 1) * desvy^2 + (Proporcion_M - 1) * desvx^2) /
    (Proporcion_H + Proporcion_M - 2)
)
Sp
## [1] 4.78191
# Pooled two-sample t statistic for H_0: difference in means equal to 12.
Test <- (Y_barraM - x_barraF - 12) /
  (Sp * sqrt((1 / Proporcion_H) + (1 / Proporcion_M)))
Test
## [1] 0.1456135
# Upper 5% critical value of the t distribution with 20 degrees of freedom
# (16 + 6 - 2).
qt(0.05, 20, lower.tail = FALSE)
## [1] 1.724718
# Welch (unequal-variance) two-sample t test of H_0: difference in means = 12
# against the "greater" alternative. Because it does not pool the variances,
# its statistic and degrees of freedom differ slightly from the manual pooled
# computation. t.test() has no `graph` argument, so that stray argument is
# dropped; FALSE is written out instead of the reassignable shorthand F.
t.test(Mas$Estatura, Fem$Estatura, mu = 12, var.equal = FALSE,
       alternative = "greater")
## 
##  Welch Two Sample t-test
## 
## data:  Mas$Estatura and Fem$Estatura
## t = 0.14062, df = 8.468, p-value = 0.4457
## alternative hypothesis: true difference in means is greater than 12
## 95 percent confidence interval:
##  7.956585      Inf
## sample estimates:
## mean of x mean of y 
##  174.0000  161.6667

El valor P es \[P(t > 0.14062) = 0.4457\]

\[ 0.4457 > 0.05\]

EXPLICACIÓN: No hay evidencia estadística suficiente para rechazar \(H_0\).

5. Cáncer de mama

Considere el conjunto de datos en Kaggle sobre Datos reales sobre el cáncer de mama aquí

library(readr)
# Read the breast-cancer CSV from the working directory into a tibble.
BRCA <- read_csv(file = "BRCA.csv")
## Rows: 341 Columns: 16
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (11): Patient_ID, Gender, Tumour_Stage, Histology, ER status, PR status,...
## dbl  (5): Age, Protein1, Protein2, Protein3, Protein4
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Show the data as an interactive table, then as the default tibble printout.
DT::datatable(data = BRCA)
print(BRCA)
## # A tibble: 341 × 16
##    Patient_ID     Age Gender Protein1 Protein2 Protein3 Protein4 Tumour_Stage
##    <chr>        <dbl> <chr>     <dbl>    <dbl>    <dbl>    <dbl> <chr>       
##  1 TCGA-D8-A1XD    36 FEMALE   0.0804    0.426   0.547    0.274  III         
##  2 TCGA-EW-A1OX    43 FEMALE  -0.420     0.578   0.614   -0.0315 II          
##  3 TCGA-A8-A079    69 FEMALE   0.214     1.31   -0.327   -0.234  III         
##  4 TCGA-D8-A1XR    56 FEMALE   0.345    -0.211  -0.193    0.124  II          
##  5 TCGA-BH-A0BF    56 FEMALE   0.222     1.91    0.520   -0.312  II          
##  6 TCGA-AO-A1KQ    84 MALE    -0.0819    1.72   -0.0573   0.0430 III         
##  7 TCGA-D8-A73X    53 FEMALE  -0.0695    1.42   -0.361    0.392  II          
##  8 TCGA-A7-A426    50 FEMALE   0.672     1.28   -0.321   -0.112  III         
##  9 TCGA-EW-A1P5    77 FEMALE  -0.152    -0.663   1.19     0.217  II          
## 10 TCGA-A8-A09A    40 FEMALE  -0.566     1.27   -0.293    0.194  II          
## # … with 331 more rows, and 8 more variables: Histology <chr>,
## #   `ER status` <chr>, `PR status` <chr>, `HER2 status` <chr>,
## #   Surgery_type <chr>, Date_of_Surgery <chr>, Date_of_Last_Visit <chr>,
## #   Patient_Status <chr>
# Keep only the rows whose Gender is "FEMALE" and print the subset.
Femenino <- filter(BRCA, Gender == "FEMALE")
print(Femenino)
## # A tibble: 330 × 16
##    Patient_ID     Age Gender Protein1 Protein2 Protein3 Protein4 Tumour_Stage
##    <chr>        <dbl> <chr>     <dbl>    <dbl>    <dbl>    <dbl> <chr>       
##  1 TCGA-D8-A1XD    36 FEMALE   0.0804    0.426    0.547   0.274  III         
##  2 TCGA-EW-A1OX    43 FEMALE  -0.420     0.578    0.614  -0.0315 II          
##  3 TCGA-A8-A079    69 FEMALE   0.214     1.31    -0.327  -0.234  III         
##  4 TCGA-D8-A1XR    56 FEMALE   0.345    -0.211   -0.193   0.124  II          
##  5 TCGA-BH-A0BF    56 FEMALE   0.222     1.91     0.520  -0.312  II          
##  6 TCGA-D8-A73X    53 FEMALE  -0.0695    1.42    -0.361   0.392  II          
##  7 TCGA-A7-A426    50 FEMALE   0.672     1.28    -0.321  -0.112  III         
##  8 TCGA-EW-A1P5    77 FEMALE  -0.152    -0.663    1.19    0.217  II          
##  9 TCGA-A8-A09A    40 FEMALE  -0.566     1.27    -0.293   0.194  II          
## 10 TCGA-S3-A6ZG    71 FEMALE  -0.223     0.506   -0.349  -0.835  II          
## # … with 320 more rows, and 8 more variables: Histology <chr>,
## #   `ER status` <chr>, `PR status` <chr>, `HER2 status` <chr>,
## #   Surgery_type <chr>, Date_of_Surgery <chr>, Date_of_Last_Visit <chr>,
## #   Patient_Status <chr>
# Descriptive statistics of Age in the female subset: centre, size, spread
# and range, each printed in turn.
edad_fem <- Femenino[["Age"]]
mean(edad_fem)
## [1] 58.85152
median(edad_fem)
## [1] 58
length(edad_fem)
## [1] 330
sd(edad_fem)
## [1] 12.92293
max(edad_fem)
## [1] 90
min(edad_fem)
## [1] 29
# Column-by-column summary of the whole subset.
summary(object = Femenino)
##   Patient_ID             Age           Gender             Protein1        
##  Length:330         Min.   :29.00   Length:330         Min.   :-2.340900  
##  Class :character   1st Qu.:49.00   Class :character   1st Qu.:-0.361740  
##  Mode  :character   Median :58.00   Mode  :character   Median : 0.004813  
##                     Mean   :58.85                      Mean   :-0.039322  
##                     3rd Qu.:67.75                      3rd Qu.: 0.336462  
##                     Max.   :90.00                      Max.   : 1.593600  
##     Protein2          Protein3           Protein4        Tumour_Stage      
##  Min.   :-0.9787   Min.   :-1.62740   Min.   :-2.02550   Length:330        
##  1st Qu.: 0.3518   1st Qu.:-0.51375   1st Qu.:-0.38120   Class :character  
##  Median : 0.9928   Median :-0.17318   Median : 0.04340   Mode  :character  
##  Mean   : 0.9468   Mean   :-0.08907   Mean   : 0.01081                     
##  3rd Qu.: 1.6279   3rd Qu.: 0.28093   3rd Qu.: 0.43440                     
##  Max.   : 3.4022   Max.   : 2.19340   Max.   : 1.62990                     
##   Histology          ER status          PR status         HER2 status       
##  Length:330         Length:330         Length:330         Length:330        
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##  Surgery_type       Date_of_Surgery    Date_of_Last_Visit Patient_Status    
##  Length:330         Length:330         Length:330         Length:330        
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
## 
library(ggplot2)
# Scatter plot of Age against Protein2 for the female subset, coloured by
# Surgery_type.
# Protein2 takes zero/negative values (its minimum in the summary is -0.9787),
# so a log10 x-axis turned those into NaN/-Inf and dropped 60 points; a linear
# axis keeps every observation.
# Tumour_Stage is categorical, so it is mapped to shape instead of size.
ggplot(
  Femenino,
  aes(x = Protein2, y = Age, color = Surgery_type, shape = Tumour_Stage)
) +
  geom_point()

# Keep only the rows whose Gender is "MALE" and print the subset.
Masculino <- filter(BRCA, Gender == "MALE")
print(Masculino)
## # A tibble: 4 × 16
##   Patient_ID     Age Gender Protein1 Protein2 Protein3 Protein4 Tumour_Stage
##   <chr>        <dbl> <chr>     <dbl>    <dbl>    <dbl>    <dbl> <chr>       
## 1 TCGA-AO-A1KQ    84 MALE    -0.0819    1.72   -0.0573   0.0430 III         
## 2 TCGA-AQ-A54O    51 MALE     0.838     0.506  -0.545    0.315  II          
## 3 TCGA-AR-A1AV    68 MALE     1.47      0.469   0.136   -0.290  II          
## 4 TCGA-A8-A085    44 MALE     0.733     1.11   -0.270   -0.355  II          
## # … with 8 more variables: Histology <chr>, `ER status` <chr>,
## #   `PR status` <chr>, `HER2 status` <chr>, Surgery_type <chr>,
## #   Date_of_Surgery <chr>, Date_of_Last_Visit <chr>, Patient_Status <chr>
# Descriptive statistics of Age in the male subset: centre, size, spread
# and range, each printed in turn.
edad_masc <- Masculino[["Age"]]
mean(edad_masc)
## [1] 61.75
median(edad_masc)
## [1] 59.5
length(edad_masc)
## [1] 4
sd(edad_masc)
## [1] 17.93274
max(edad_masc)
## [1] 84
min(edad_masc)
## [1] 44
# Column-by-column summary of the whole subset.
summary(object = Masculino)
##   Patient_ID             Age           Gender             Protein1       
##  Length:4           Min.   :44.00   Length:4           Min.   :-0.08187  
##  Class :character   1st Qu.:49.25   Class :character   1st Qu.: 0.52907  
##  Mode  :character   Median :59.50   Mode  :character   Median : 0.78531  
##                     Mean   :61.75                      Mean   : 0.73978  
##                     3rd Qu.:72.00                      3rd Qu.: 0.99602  
##                     Max.   :84.00                      Max.   : 1.47040  
##     Protein2         Protein3            Protein4        Tumour_Stage      
##  Min.   :0.4690   Min.   :-0.545130   Min.   :-0.35492   Length:4          
##  1st Qu.:0.4971   1st Qu.:-0.338423   1st Qu.:-0.30611   Class :character  
##  Median :0.8091   Median :-0.163428   Median :-0.12341   Mode  :character  
##  Mean   :0.9528   Mean   :-0.184111   Mean   :-0.07171                     
##  3rd Qu.:1.2648   3rd Qu.:-0.009116   3rd Qu.: 0.11099                     
##  Max.   :1.7241   Max.   : 0.135540   Max.   : 0.31490                     
##   Histology          ER status          PR status         HER2 status       
##  Length:4           Length:4           Length:4           Length:4          
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##  Surgery_type       Date_of_Surgery    Date_of_Last_Visit Patient_Status    
##  Length:4           Length:4           Length:4           Length:4          
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
## 
library(ggplot2)
# Scatter plot of Age against Protein1 for the male subset, coloured by
# Surgery_type.
# Protein1 has a negative value here (minimum -0.08187 in the summary), so a
# log10 x-axis turned it into NaN and dropped 1 of the 4 points; a linear axis
# keeps every observation.
# Patient_Status is categorical, so it is mapped to shape instead of size.
ggplot(
  Masculino,
  aes(x = Protein1, y = Age, color = Surgery_type, shape = Patient_Status)
) +
  geom_point()