library(readr)
# Read the patient data set from a CSV file into `db` and display it.
# NOTE(review): the path points into one user's home directory, so it has to
# be adjusted before the script can run on another machine.
db <- read_csv(
  file = "~/Desktop/Carpetas/Coursera_Statistics/Curso_Imperial_College_London/data_base.csv"
)
db
## # A tibble: 66 × 9
## patient_id age gender bmi smoking exercise fruit veg cancer
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 61 0 20.8 2 0 1 2 0
## 2 2 68 1 27.3 0 0 0 1 0
## 3 3 62 1 22.2 0 0 1 3 0
## 4 4 61 1 35.3 2 0 2 4 0
## 5 5 58 1 22.7 1 0 3 1 0
## 6 6 46 1 27.7 1 0 0 2 1
## 7 7 67 1 29.0 2 0 0 3 1
## 8 8 68 0 19.8 0 0 3 4 0
## 9 9 53 1 17.6 0 0 1 3 0
## 10 10 59 0 27.7 2 0 1 3 0
## # … with 56 more rows
# Show the first six rows (the default row count of head(), written out).
head(db, n = 6L)
## # A tibble: 6 × 9
## patient_id age gender bmi smoking exercise fruit veg cancer
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 61 0 20.8 2 0 1 2 0
## 2 2 68 1 27.3 0 0 0 1 0
## 3 3 62 1 22.2 0 0 1 3 0
## 4 4 61 1 35.3 2 0 2 4 0
## 5 5 58 1 22.7 1 0 3 1 0
## 6 6 46 1 27.7 1 0 0 2 1
# Summary statistics ----
# Numeric columns are summarised with summary(); coded columns are counted
# with table(). The `##` lines are the console output pasted in for reference.

# Age: minimum, quartiles, mean and maximum.
summary(db[["age"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 39.00 54.00 61.00 61.02 69.75 89.00

# Gender: count per code.
table(db[["gender"]])
##
## 0 1
## 33 33

# BMI: minimum, quartiles, mean and maximum.
summary(db[["bmi"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 10.81 21.41 24.78 24.22 27.30 40.62

# Smoking: count per code.
# NOTE(review): the counts below add up to 65 while the tibble has 66 rows;
# table() leaves out NA by default, so one value is presumably missing.
table(db[["smoking"]])
##
## 0 1 2
## 26 18 21

# Exercise: count per code.
table(db[["exercise"]])
##
## 0 1 2
## 22 22 22

# Vegetables: count per recorded value.
table(db[["veg"]])
##
## 0 1 2 3 4 5 6 9
## 2 11 15 18 12 3 2 3

# Fruit: count per recorded value.
table(db[["fruit"]])
##
## 0 1 2 3 4
## 24 24 12 5 1

# Cancer: count per code.
table(db[["cancer"]])
##
## 0 1
## 51 15
# Flag patients whose BMI lies strictly between 18.5 and 25:
# 1 = inside that range, 0 = outside it; a missing BMI gives NA.
db$healthy_BMI <- with(
  db,
  ifelse(test = bmi > 18.5 & bmi < 25, yes = 1, no = 0)
)
# Count the patients in each group.
table(db$healthy_BMI)
##
## 0 1
## 40 26
# Histograms ----
# Distributions of age and BMI with the default breaks, title and labels.
hist(db$age)
hist(db$bmi)

# Combined fruit and vegetable value per patient (fruit + veg), its frequency
# table and a first default histogram.
fruitveg <- db$fruit + db$veg
table(fruitveg)
## fruitveg
## 0 1 2 3 4 5 6 7 8 10 11
## 1 8 4 15 16 10 6 2 1 2 1
hist(fruitveg)
class(fruitveg)
## [1] "numeric"

# class() above reports "numeric", so unlist()/as.numeric() do not change the
# values here; the conversion is kept so that `fruitveg_num` stays defined.
fruitveg_num <- as.numeric(unlist(fruitveg))
hist(fruitveg_num)

# Same histogram with a descriptive title, x-axis label and fill colour.
hist(fruitveg_num,
  xlab = "Portions of fruit and vegetables",
  main = "Daily consumption of fruit and vegetables combined",
  col = "green"
)

# Final version: the default axes are suppressed and drawn by hand so that
# the ticks fall on every portion count (x) and every second frequency (y).
# `FALSE` is spelled out because `F` is an ordinary variable that can be
# reassigned, whereas `FALSE` is a reserved word.
hist(fruitveg_num,
  xlab = "Portions of fruit and vegetables",
  main = "Daily consumption of fruit and vegetables combined",
  axes = FALSE,
  col = "green"
)
axis(side = 1, at = seq(0, 11, 1))
axis(side = 2, at = seq(0, 16, 2))
# Hypothesis tests ----
# Chi-squared test: five-a-day indicator versus cancer.

# Total of the fruit and veg columns per patient.
db$five_a_day <- db$fruit + db$veg
# Indicator for "five or more fruit or vegetables a day":
# 1 when the total is above 4, otherwise 0; a missing total stays NA.
db$five_a_day1 <- as.numeric(db$five_a_day > 4)
# Test for association between the indicator and cancer.
chisq.test(x = db$five_a_day1, y = db$cancer)
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: db$five_a_day1 and db$cancer
## X-squared = 2.4265, df = 1, p-value = 0.1193
# Overweight indicator: 1 when BMI is above 25, otherwise 0; a missing BMI
# gives NA. The single comparison replaces two replace() calls whose cut-offs
# (`< 26` and `> 25`) overlapped, so that BMIs between 25 and 26 were first
# coded 0 and then overwritten with 1. The resulting values are unchanged.
# NOTE(review): a BMI of exactly 25 is coded 0 here — confirm that this is the
# intended definition of overweight.
db$over_w <- as.numeric(db$bmi > 25)
# Chi-squared test for association between the overweight indicator and cancer.
chisq.test(x = db$over_w, y = db$cancer)
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: db$over_w and db$cancer
## X-squared = 0.20625, df = 1, p-value = 0.6497
# Answer 1: no statistically significant association between eating five or more portions of fruit and vegetables a day and cancer.
# Answer 2: no statistically significant association between being overweight and cancer.
# Student's t-test ----
# Compare BMI between the two cancer groups.

# Welch's version: the group variances are not assumed to be equal
# (this is the default of t.test(), written out here).
t.test(db$bmi ~ db$cancer, var.equal = FALSE)
##
## Welch Two Sample t-test
##
## data: db$bmi by db$cancer
## t = 0.90034, df = 21.878, p-value = 0.3777
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -1.735200 4.396204
## sample estimates:
## mean in group 0 mean in group 1
## 24.5198 23.1893

# Classical two-sample version: the group variances are assumed to be equal.
t.test(db$bmi ~ db$cancer, var.equal = TRUE)
##
## Two Sample t-test
##
## data: db$bmi by db$cancer
## t = 0.92959, df = 64, p-value = 0.3561
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -1.528819 4.189823
## sample estimates:
## mean in group 0 mean in group 1
## 24.5198 23.1893
# Answer 3: no statistically significant difference in mean BMI between patients with and without cancer.