# DATA 606 Week 1 Lab 1 - Introduction to Data
# Student Name: Kalyan (Kalyanaraman Parthasarathy)

# load library
library(plyr)

# Read the CDC data set as a CSV file straight from the OpenIntro web site.
# The data could not be installed locally, so it is fetched over the network
# each time the script runs (an internet connection is required).
CDCData <- read.csv("http://www.openintro.org/stat/data/cdc.csv")

# List the variable (column) names of the data set.
names(CDCData)

# Exercise 1 - How many cases are there in this data set?
# How many variables? For each variable,
# identify its data type (e.g. categorical, discrete).

# Cases are the rows of the data frame: first element of dim() - 20,000
dim(CDCData)[1L]

# Variables are the columns of the data frame: second element of dim() - 9
dim(CDCData)[2L]

# Show the first six rows to inspect the values of each variable
# and decide on its data type (listed below).
head(CDCData, n = 6L)

# genhlth -->  Categorical
# exerany -->  Categorical
# hlthplan -->  Categorical
# smoke100 -->  Categorical
# height  -->  Numeric (Continuous)
# weight  -->  Numeric (Continuous)
# wtdesire -->  Numeric (Continuous)
# age -->  Numeric (Continuous)
# gender -->  Categorical


# Exercise 2 - Create a numerical summary for height
# and age, and compute the interquartile range for each.
# Compute the relative frequency distribution for gender and exerany.
# How many males are in the sample? What proportion of
# the sample reports being in excellent health?

# Part 1: numerical summaries and interquartile ranges for height and age.
# The recorded output of each command is shown in the comment above it.

# Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
# 48.00   64.00   67.00   67.18   70.00   93.00 
with(CDCData, summary(height))

# Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
# 18.00   31.00   43.00   45.07   57.00   99.00 
with(CDCData, summary(age))

# IQR = 3rd quartile - 1st quartile (70 - 64)
# [1] 6
with(CDCData, IQR(height))

# IQR = 3rd quartile - 1st quartile (57 - 31)
# [1] 26
with(CDCData, IQR(age))


# Relative frequency distribution for gender and exerany.

# table() gives the absolute frequency of each level; dividing by the
# total number of cases turns those counts into relative frequencies.

# gender
table(CDCData[["gender"]]) / NROW(CDCData)

# exerany
table(CDCData[["exerany"]]) / NROW(CDCData)


# How many males are in the sample? What proportion of
# the sample reports being in excellent health?
#
# Rows are counted by summing a logical mask (TRUE counts as 1).
# The previous approach, nrow(count(<subset>)), was wrong: plyr::count()
# collapses identical rows into a single row with a `freq` column, so its
# row count is the number of *distinct* rows. It therefore under-counted
# whenever two respondents gave identical answers (it reported 9,501 males
# and 4,619 respondents in excellent health).

# Males in the sample - expected 9,569 (re-run to confirm)
sum(CDCData$gender == "m", na.rm = TRUE)

# Respondents reporting excellent health - expected 4,657 (re-run to confirm)
excellent_count <- sum(CDCData$genhlth == "excellent", na.rm = TRUE)
excellent_count

# Proportion of the whole sample in excellent health
# - expected 0.23285 (re-run to confirm)
excellent_count / nrow(CDCData)


# Cross-tabulate gender (rows) against smoke100 (columns) to get the
# count of respondents in each gender / smoke100 combination.
table(CDCData$gender,CDCData$smoke100)

# Draw the same contingency table as a mosaic plot so the split between
# the smoke100 groups can be compared visually for each gender.
mosaicplot(table(CDCData$gender,CDCData$smoke100))


# Exercise 3 - What does the mosaic plot reveal 
# about smoking habits and gender?
# The mosaic plot reveals that fewer women 
# than men are smokers


# Exercise 4 - Create a new object called under23_and_smoke
# that contains all observations of respondents under
# the age of 23 that have smoked 100 cigarettes in their lifetime.
# Write the command you used to create the new object
# as the answer to this exercise.

# Keep every column, but only the rows where both conditions hold:
# age below 23 and smoke100 equal to 1. Rows where the condition is NA
# are dropped.
under23_and_smoke <- subset(CDCData, age < 23 & smoke100 == 1)


# Exercise 5 - What does this box plot show? 
# Pick another categorical variable from the data set 
# and see how it relates to BMI. 
# List the variable you chose, why you might think 
# it would have a relationship to BMI, and indicate what the figure seems to suggest.

# Answer: A box plot summarizes the distribution of the observations (cases):
# the median, the quartiles and the spread. It also helps to identify
# potential outliers, which can then be investigated.

# Body mass index for every respondent: weight divided by height squared,
# scaled by 703 (presumably weight in pounds and height in inches -
# confirm against the data dictionary).
bmi <- with(CDCData, weight / height^2 * 703)

# One box per general-health category, comparing the BMI distributions.
boxplot(bmi ~ CDCData$genhlth)
# Variable chosen: genhlth (general health condition).
# The box plot suggests a direct relationship between health status and
# BMI - respondents reporting better health tend to have better BMI values
# and appear to keep their weight under better control.

# Same comparison grouped by age (one box per distinct age value).
boxplot(bmi ~ CDCData$age)
# Replacing the health condition with age shows that 
# younger people have a lower BMI and that BMI increases through middle age. 
# BMI then comes down again for older people, possibly because 
# people focus more on their weight (and health) as they get older.
# Note: age is numeric rather than categorical, so this plot draws one box per age.