## Loading required package: ggplot2

Get to know the dataset

# Preview the first rows of esoph, then report its number of rows and columns.
head(esoph)
##   agegp     alcgp    tobgp ncases ncontrols
## 1 25-34 0-39g/day 0-9g/day      0        40
## 2 25-34 0-39g/day    10-19      0        10
## 3 25-34 0-39g/day    20-29      0         6
## 4 25-34 0-39g/day      30+      0         5
## 5 25-34     40-79 0-9g/day      0        27
## 6 25-34     40-79    10-19      0         7
nrow(esoph)
## [1] 88
ncol(esoph)
## [1] 5

Giving Descriptive Names to the Columns

# Swap the terse column names for descriptive ones, then preview the result.
names(esoph) <- c(
  "AgeGroup",
  "AlcoholConsumption",
  "TobaccoConsumption",
  "Numberofcases",
  "Numberofcontrols"
)
head(esoph)
##   AgeGroup AlcoholConsumption TobaccoConsumption Numberofcases
## 1    25-34          0-39g/day           0-9g/day             0
## 2    25-34          0-39g/day              10-19             0
## 3    25-34          0-39g/day              20-29             0
## 4    25-34          0-39g/day                30+             0
## 5    25-34              40-79           0-9g/day             0
## 6    25-34              40-79              10-19             0
##   Numberofcontrols
## 1               40
## 2               10
## 3                6
## 4                5
## 5               27
## 6                7
# Exploratory Data Analysis Question 1: Which age groups have the highest risk
# for esophageal cancer?
X <- subset(esoph, select = c("AgeGroup", "Numberofcases"))
head(X)
##   AgeGroup Numberofcases
## 1    25-34             0
## 2    25-34             0
## 3    25-34             0
## 4    25-34             0
## 5    25-34             0
## 6    25-34             0
# Plot the case counts of the individual rows against their age group.
plot(X, main = "Esophageal cancer risks by age group")

# Case counts alone ignore how many controls were recorded in each age group,
# so also plot the share of cases among all recorded subjects per age group.
cases_by_age <- tapply(esoph$Numberofcases, esoph$AgeGroup, sum)
controls_by_age <- tapply(esoph$Numberofcontrols, esoph$AgeGroup, sum)
case_share_by_age <- cases_by_age / (cases_by_age + controls_by_age)
barplot(case_share_by_age,
        main = "Share of cases by age group",
        xlab = "Age group",
        ylab = "Cases / (cases + controls)")

# Exploratory Data Analysis Question 2: Which alcohol consumption range carries
# the highest risk for esophageal cancer?
Y <- subset(
  esoph,
  select = c("AlcoholConsumption", "TobaccoConsumption", "Numberofcases")
)
head(Y)
##   AlcoholConsumption TobaccoConsumption Numberofcases
## 1          0-39g/day           0-9g/day             0
## 2          0-39g/day              10-19             0
## 3          0-39g/day              20-29             0
## 4          0-39g/day                30+             0
## 5              40-79           0-9g/day             0
## 6              40-79              10-19             0
# Axis labels for the alcohol ranges, shared by both bar plots below.
alcohol_labels <- c("0-39 g/day", "40-79 g/day", "80-119 g/day", "120+ g/day")

# Mean case count per row within each alcohol consumption range.
heights <- tapply(Y$Numberofcases, Y$AlcoholConsumption, mean)
barplot(heights, main = "Mean number of cases by alcohol consumption",
        names.arg = alcohol_labels,
        ylab = "Number of cases")

# Mean case counts ignore how many controls were recorded in each range, so
# also plot the share of cases among all recorded subjects per alcohol range.
alcohol_cases <- tapply(esoph$Numberofcases, esoph$AlcoholConsumption, sum)
alcohol_controls <- tapply(
  esoph$Numberofcontrols, esoph$AlcoholConsumption, sum
)
alcohol_case_share <- alcohol_cases / (alcohol_cases + alcohol_controls)
barplot(alcohol_case_share,
        main = "Share of cases by alcohol consumption",
        names.arg = alcohol_labels,
        ylab = "Cases / (cases + controls)")

# Exploratory Data Analysis Question 3: Which tobacco consumption range carries
# the highest risk for esophageal cancer?
# Axis labels for the tobacco ranges, shared by both bar plots below.
tobacco_labels <- c("0-9 g/day", "10-19 g/day", "20-29 g/day", "30+ g/day")

# Mean case count per row within each tobacco consumption range; Y is the
# subset of esoph built for Question 2.
heights2 <- tapply(Y$Numberofcases, Y$TobaccoConsumption, mean)
barplot(heights2, main = "Mean number of cases by tobacco consumption",
        names.arg = tobacco_labels,
        ylab = "Number of cases")

# Mean case counts ignore how many controls were recorded in each range, so
# also plot the share of cases among all recorded subjects per tobacco range.
tobacco_cases <- tapply(esoph$Numberofcases, esoph$TobaccoConsumption, sum)
tobacco_controls <- tapply(
  esoph$Numberofcontrols, esoph$TobaccoConsumption, sum
)
tobacco_case_share <- tobacco_cases / (tobacco_cases + tobacco_controls)
barplot(tobacco_case_share,
        main = "Share of cases by tobacco consumption",
        names.arg = tobacco_labels,
        ylab = "Cases / (cases + controls)")

# Conclusion 1: The 55-64 age group has the highest number of esophageal cancer cases.
# Conclusion 2: Alcohol consumption of 40-79 g/day has the highest average number of esophageal cancer cases.
# Conclusion 3: Tobacco consumption of 0-9 g/day has the highest average number of esophageal cancer cases.
# Note: these conclusions compare case counts only; the number of controls in each group is not taken into account.