library(readr)
# Load the data set from the CSV file in the working directory into a tibble,
# then preview its first six rows.
The_Cancer_data_1500_V2 <- read_csv(
  file = "The_Cancer_data_1500_V2.csv"
)
head(The_Cancer_data_1500_V2, n = 6L)
## # A tibble: 6 × 9
##     Age Gender   BMI Smoking GeneticRisk PhysicalActivity AlcoholIntake
##   <dbl>  <dbl> <dbl>   <dbl>       <dbl>            <dbl>         <dbl>
## 1    58      1  16.1       0           1             8.15          4.15
## 2    71      0  30.8       0           1             9.36          3.52
## 3    48      1  38.8       0           2             5.14          4.73
## 4    34      0  30.0       0           0             9.50          2.04
## 5    62      1  35.5       0           0             5.36          3.31
## 6    27      0  37.1       0           1             3.94          2.32
## # ℹ 2 more variables: CancerHistory <dbl>, Diagnosis <dbl>

Independencia

library(dplyr)
library(gtsummary)

# Cross-tabulate Smoking (rows) against Diagnosis (columns) and display
# the resulting table of counts.
contingency_table <- with(
  The_Cancer_data_1500_V2,
  table(Smoking, Diagnosis)
)
print(contingency_table)
##        Diagnosis
## Smoking   0   1
##       0 762 334
##       1 181 223
# Pearson chi-squared test of independence on the Smoking x Diagnosis table.
# `correct = TRUE` is the default, spelled out here because it is what applies
# Yates' continuity correction to a 2x2 table.
chisq.test(x = contingency_table, correct = TRUE)
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  contingency_table
## X-squared = 76.237, df = 1, p-value < 2.2e-16

Correlación

# Scatter plot of Age (x axis) against PhysicalActivity (y axis), drawn with
# filled, semi-transparent blue points.
plot(
  x = The_Cancer_data_1500_V2$Age,
  y = The_Cancer_data_1500_V2$PhysicalActivity,
  pch = 16,
  col = rgb(0, 0, 1, 0.6),
  main = "Relación entre Edad y Actividad Física",
  xlab = "Edad",
  ylab = "Horas de Actividad Física por Semana"
)
# Separate top-level call: overlays reference grid lines on the finished plot.
grid()

# Spearman rank correlation test between Age and PhysicalActivity
# (two-sided by default).
cor.test(
  x = The_Cancer_data_1500_V2$Age,
  y = The_Cancer_data_1500_V2$PhysicalActivity,
  method = "spearman"
)
## 
##  Spearman's rank correlation rho
## 
## data:  The_Cancer_data_1500_V2$Age and The_Cancer_data_1500_V2$PhysicalActivity
## S = 553698029, p-value = 0.5448
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##        rho 
## 0.01564751

Normalidad

# Histogram of Age; `breaks = 10` is a suggested number of bins, which R may
# adjust to obtain "pretty" break points.
hist(
  x = The_Cancer_data_1500_V2$Age,
  breaks = 10,
  col = "blue",
  border = "black",
  main = "Histograma de Edad",
  xlab = "Edad",
  ylab = "Frecuencia"
)

# Kernel density estimate of Age (default kernel and bandwidth), drawn as a
# blue curve.
plot(
  x = density(The_Cancer_data_1500_V2$Age),
  col = "blue",
  main = "Gráfica de Densidad de Edad",
  xlab = "Edad",
  ylab = "Densidad"
)

# One-sample Kolmogorov-Smirnov test of Age against a normal distribution
# whose mean and standard deviation are computed from the sample itself.
# Both distribution parameters are passed by name so that neither relies on
# positional matching through `...` to reach pnorm().
# NOTE(review): ?ks.test says the reference distribution's parameters should
# be pre-specified rather than estimated from the data, so this p-value is
# only approximate; consider shapiro.test() or a Lilliefors-corrected test
# if a formal normality test is required.
ks.test(
  The_Cancer_data_1500_V2$Age,
  "pnorm",
  mean = mean(The_Cancer_data_1500_V2$Age),
  sd = sd(The_Cancer_data_1500_V2$Age)
)
## 
##  Asymptotic one-sample Kolmogorov-Smirnov test
## 
## data:  The_Cancer_data_1500_V2$Age
## D = 0.071622, p-value = 4.146e-07
## alternative hypothesis: two-sided