library(readr)
# Read the patient data set from CSV into a tibble.
# NOTE(review): the path is absolute and user-specific, so the script only
# runs on a machine where this exact file location exists.
db <- read_csv(
  file = "~/Desktop/Carpetas/Coursera_Statistics/Curso_Imperial_College_London/data_base.csv"
)

# Print the tibble; the recorded output shows 66 rows and 9 numeric columns.
print(db)
## # A tibble: 66 × 9
##    patient_id   age gender   bmi smoking exercise fruit   veg cancer
##         <dbl> <dbl>  <dbl> <dbl>   <dbl>    <dbl> <dbl> <dbl>  <dbl>
##  1          1    61      0  20.8       2        0     1     2      0
##  2          2    68      1  27.3       0        0     0     1      0
##  3          3    62      1  22.2       0        0     1     3      0
##  4          4    61      1  35.3       2        0     2     4      0
##  5          5    58      1  22.7       1        0     3     1      0
##  6          6    46      1  27.7       1        0     0     2      1
##  7          7    67      1  29.0       2        0     0     3      1
##  8          8    68      0  19.8       0        0     3     4      0
##  9          9    53      1  17.6       0        0     1     3      0
## 10         10    59      0  27.7       2        0     1     3      0
## # … with 56 more rows

# Preview only the first six rows.
head(db, n = 6)
## # A tibble: 6 × 9
##   patient_id   age gender   bmi smoking exercise fruit   veg cancer
##        <dbl> <dbl>  <dbl> <dbl>   <dbl>    <dbl> <dbl> <dbl>  <dbl>
## 1          1    61      0  20.8       2        0     1     2      0
## 2          2    68      1  27.3       0        0     0     1      0
## 3          3    62      1  22.2       0        0     1     3      0
## 4          4    61      1  35.3       2        0     2     4      0
## 5          5    58      1  22.7       1        0     3     1      0
## 6          6    46      1  27.7       1        0     0     2      1

#Summary statistics

# Continuous variables get a six-number summary(); coded variables get a
# frequency table(). Columns are extracted with [[ ]] so each call receives a
# plain vector.

# Age: min, quartiles, median, mean, max.
summary(db[["age"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   39.00   54.00   61.00   61.02   69.75   89.00

# Gender: counts per code (0/1).
table(db[["gender"]])
## 
##  0  1 
## 33 33

# BMI: min, quartiles, median, mean, max.
summary(db[["bmi"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   10.81   21.41   24.78   24.22   27.30   40.62

# Smoking: counts per code (0/1/2). The counts sum to 65 of 66 rows, so one
# value is presumably missing (table() omits NA by default).
table(db[["smoking"]])
## 
##  0  1  2 
## 26 18 21

# Exercise: counts per code (0/1/2).
table(db[["exercise"]])
## 
##  0  1  2 
## 22 22 22

# Vegetable portions: counts per observed value.
table(db[["veg"]])
## 
##  0  1  2  3  4  5  6  9 
##  2 11 15 18 12  3  2  3

# Fruit portions: counts per observed value.
table(db[["fruit"]])
## 
##  0  1  2  3  4 
## 24 24 12  5  1

# Cancer outcome: counts per code (0/1).
table(db[["cancer"]])
## 
##  0  1 
## 51 15
# Flag patients whose BMI lies in the healthy range 18.5 <= BMI < 25
# (1 = healthy, 0 = outside the range, NA where bmi is missing).
# The lower bound is inclusive: a BMI of exactly 18.5 is classified as normal
# weight, so a strict `>` would wrongly exclude it.
db$healthy_BMI <- ifelse(db$bmi >= 18.5 & db$bmi < 25, 1, 0)
# Frequency of each flag value.
table(db$healthy_BMI)
## 
##  0  1 
## 40 26

#Histograms

# Distribution of age and of BMI (default breaks and labels).
hist(db$age)

hist(db$bmi)

# Combined daily portions of fruit and vegetables for each patient.
fruitveg <- db$fruit + db$veg
table(fruitveg)
## fruitveg
##  0  1  2  3  4  5  6  7  8 10 11 
##  1  8  4 15 16 10  6  2  1  2  1
hist(fruitveg)

class(fruitveg)
## [1] "numeric"
# fruitveg is already a plain numeric vector, so no unlist() is needed;
# as.numeric() is kept only to make the type explicit for hist().
fruitveg_num <- as.numeric(fruitveg)
hist(fruitveg_num)

# Same histogram with a descriptive axis label, title and fill colour.
hist(fruitveg_num, xlab = "Portions of fruit and vegetables",
     main = "Daily consumption of fruit and vegetables combined",
     col = "green")

# Suppress the default axes (FALSE spelled out; `F` can be reassigned) and
# draw custom ones: x ticks at every portion count, y ticks every 2 patients.
hist(fruitveg_num, xlab = "Portions of fruit and vegetables",
     main = "Daily consumption of fruit and vegetables combined",
     axes = FALSE, col = "green")
axis(side = 1, at = seq(0, 11, 1))
axis(side = 2, at = seq(0, 16, 2))

#hypothesis tests

#chi square

# Total daily portions of fruit and vegetables for each patient.
db$five_a_day <- db$fruit + db$veg

# Binary indicator for "five or more portions of fruit or vegetables a day":
# 1 = five or more, 0 = fewer than five, NA where the total is missing.
# A single `>= 5` comparison replaces two overlapping replace() passes
# (`< 5` then `> 4`), whose result depended on the order they ran in.
db$five_a_day1 <- ifelse(db$five_a_day >= 5, 1, 0)

# Chi-squared test of independence between the five-a-day indicator and
# cancer status (2x2 table, so Yates' continuity correction is applied).
chisq.test(x = db$five_a_day1, y = db$cancer)
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  db$five_a_day1 and db$cancer
## X-squared = 2.4265, df = 1, p-value = 0.1193
# Binary overweight indicator: 1 = BMI of 25 or above, 0 = BMI below 25,
# NA where bmi is missing.
# A single cut-off replaces two overlapping replace() passes (`< 26` then
# `> 25`): values between 25 and 26 were first set to 0 and then overwritten
# with 1, so the `< 26` threshold never took effect. Using `>= 25` also makes
# this flag the exact complement of the `< 25` upper bound of the healthy
# BMI range.
db$over_w <- ifelse(db$bmi >= 25, 1, 0)

# Chi-squared test of independence between the overweight indicator and
# cancer status (2x2 table, so Yates' continuity correction is applied).
chisq.test(x = db$over_w, y = db$cancer)
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  db$over_w and db$cancer
## X-squared = 0.20625, df = 1, p-value = 0.6497

#Answer1: no statistically significant association between fruit & vegetable consumption (five or more portions a day) and cancer (p = 0.12).

#Answer2: no statistically significant association between being overweight and cancer (p = 0.65).

#Student's t-test

# Welch two-sample t-test of BMI by cancer status: the two groups are not
# assumed to share a variance (var.equal = FALSE is t.test's default).
t.test(db$bmi ~ db$cancer, var.equal = FALSE)
## 
##  Welch Two Sample t-test
## 
## data:  db$bmi by db$cancer
## t = 0.90034, df = 21.878, p-value = 0.3777
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -1.735200  4.396204
## sample estimates:
## mean in group 0 mean in group 1 
##         24.5198         23.1893

# Classical two-sample t-test on the same comparison, pooling the variance.
t.test(db$bmi ~ db$cancer, var.equal = TRUE)
## 
##  Two Sample t-test
## 
## data:  db$bmi by db$cancer
## t = 0.92959, df = 64, p-value = 0.3561
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -1.528819  4.189823
## sample estimates:
## mean in group 0 mean in group 1 
##         24.5198         23.1893

#Answer3: no statistically significant difference in mean BMI between patients with and without cancer (Welch p = 0.38; pooled-variance p = 0.36).