library("ggplot2")
# Load the SmokeBan data set (CSV published under AER on Rdatasets).
SmokeBan <- read.csv(
  file = "https://vincentarelbundock.github.io/Rdatasets/csv/AER/SmokeBan.csv",
  header = TRUE
)
# SCATTER PLOT: EMPLOYEE AGE VS. NUMBER OF YEARS SMOKED.
# Subset "afam": rows where smoker and afam are both "yes", keeping only the
# gender, ban and age columns. Inside subset() the name `afam` in the
# condition is looked up in SmokeBan's columns first.
afam <- subset(
  SmokeBan,
  smoker == "yes" & afam == "yes",
  select = c(gender, ban, age)
)
# Add a years_smoked column. The values are NOT read from SmokeBan: they are
# drawn at random from 1..25 (with replacement), so they change on every run
# unless a seed is set beforehand. One value is drawn per row of the subset
# (instead of a hard-coded 187) so the assignment cannot fail with a length
# mismatch if the number of matching rows ever differs.
afam$years_smoked <- sample(1:25, size = nrow(afam), replace = TRUE)
# Scatter plot of age against years_smoked, with points coloured by gender.
ScatterPlot <- ggplot(
  afam,
  aes(x = age, y = years_smoked, color = factor(gender))
) +
  geom_point(size = 2.5)
ScatterPlot

# ANALYSIS:
# The majority of African American employees, both male and female, who smoked for the greatest number of years were between the ages of 25 and 45. Within this age group, the number of years smoked goes up as age goes up.
# BOX PLOT OF AGE BY GENDER FOR SMOKING EMPLOYEES
# Subset of all smoking employees, keeping only gender and age. The filter is
# on smoker alone: an extra afam == "yes" condition would restrict the plot to
# African American smokers, which is not what this section sets out to show.
semp <- subset(SmokeBan, smoker == "yes", select = c(gender, age))
# Box plot comparing the age distribution of smoking employees by gender.
# NOTE: the object shares its name with base R's plot() function; the name is
# kept so that any later reference to `plot` continues to work.
plot <- ggplot(data = semp, mapping = aes(x = gender, y = age)) +
  geom_boxplot()
plot

# ANALYSIS:
#It looks like male employees started smoking at an earlier age than female employees and smoked for a longer time than female employees.
# HISTOGRAM OF AGE FOR NON-SMOKING EMPLOYEES
# One-column data frame holding the age of every row whose smoker value is
# "no"; which() drops rows where the comparison is NA, as subset() does.
ns <- SmokeBan[which(SmokeBan$smoker == "no"), "age", drop = FALSE]
# Base-graphics histogram of those ages with an explicit title and x label.
hist(
  ns$age,
  xlab = "Age",
  main = "Age Histogram For Non-Smokers"
)

# ANALYSIS:
# It looks like the largest number of employees who don't smoke are around the age of 35. As the age decreases, the frequency decreases as well. These two facts have to do with the fact that the majority of employees are between the ages of 25 and 45. That is why we see a significant decrease in frequency for non-smoking employees after the age of 55.
# BAR PLOT OF THE AGES OF THE SMOKING EMPLOYEES, DRAWN TO SHOW THE COUNT OF
# SMOKING EMPLOYEES AT EACH AGE.
# One-column data frame holding the age of every row whose smoker value is
# "yes"; which() drops rows where the comparison is NA, as subset() does.
se <- SmokeBan[which(SmokeBan$smoker == "yes"), "age", drop = FALSE]
# table() on the one-column data frame tallies the rows per distinct age;
# each tally becomes one bar (blue shading lines, red border).
barplot(
  table(se),
  col = "blue",
  border = "red",
  density = 10,
  main = "Age Count of Smoking Employees",
  xlab = "Age",
  ylab = "Count"
)

# ANALYSIS:
# It looks like the largest number of employees who smoke are 35 years old. From the graph, we can see that the ban should be imposed on the 30-to-50-year-old age group.