# DATA 606 Week 1 Lab 1 - Introduction to Data
# Student Name: Kalyan (Kalyanaraman Parthasarathy)
# load library
library(plyr)
# Read the CDC data as a CSV file directly from the web.
# (The author had difficulty installing the data locally, so it is
# fetched from the source URL on every run; this requires network access.)
CDCData <- read.csv("http://www.openintro.org/stat/data/cdc.csv")
# List the column (variable) names of the loaded data frame.
names(CDCData)
# Exercise 1 - How many cases are there in this data set?
# How many variables? For each variable,
# identify its data type (e.g. categorical, discrete).
# Number of cases (rows) in the data set - 20,000
dim(CDCData)[1L]
# Number of variables (columns) - 9
dim(CDCData)[2L]
# Preview the first six rows to judge each variable's data type
# (e.g. categorical, discrete).
head(CDCData, n = 6L)
# genhlth --> Categorical
# exerany --> Categorical
# hlthplan --> Categorical
# smoke100 --> Categorical
# height --> Numeric (Continuous)
# weight --> Numeric (Continuous)
# wtdesire --> Numeric (Continuous)
# age --> Numeric (Continuous)
# gender --> Categorical
# Exercise 2 - Create a numerical summary for height
# and age, and compute the interquartile range for each.
# Compute the relative frequency distribution for gender and exerany.
# How many males are in the sample? What proportion of
# the sample reports being in excellent health?
# Create a numerical summary for height and age,
# and compute the interquartile range for each.
# Numerical summary of height. Recorded output:
# Min. 1st Qu. Median Mean 3rd Qu. Max.
# 48.00 64.00 67.00 67.18 70.00 93.00
summary(CDCData[["height"]])
# Numerical summary of age. Recorded output:
# Min. 1st Qu. Median Mean 3rd Qu. Max.
# 18.00 31.00 43.00 45.07 57.00 99.00
summary(CDCData[["age"]])
# Interquartile range of height. Recorded output: [1] 6
IQR(CDCData[["height"]])
# Interquartile range of age. Recorded output: [1] 26
IQR(CDCData[["age"]])
# Compute the relative frequency distribution for gender and exerany.
# table() gives the frequency of each level; dividing by the total number
# of rows turns those counts into a relative frequency distribution.
# Relative frequencies for gender
table(CDCData[["gender"]]) / NROW(CDCData)
# Relative frequencies for exerany
table(CDCData[["exerany"]]) / NROW(CDCData)
# How many males are in the sample? What proportion of
# the sample reports being in excellent health?
# Males in the sample: count the rows whose gender is "m".
# NOTE: the earlier nrow(count(...)) form used plyr::count(), which tallies
# *distinct* rows, so respondents with identical answers were collapsed and
# the recorded 9,501 was an undercount. Summing the logical mask counts
# every matching row (expected 9,569 for the published cdc.csv - confirm
# on re-run; it should match the gender frequency table).
sum(CDCData$gender == "m", na.rm = TRUE)
# What proportion of the sample reports being in excellent health?
# Count of respondents in excellent health (the earlier 4,619 was likewise a
# distinct-row count; expected 4,657 - confirm on re-run).
sum(CDCData$genhlth == "excellent", na.rm = TRUE)
# Proportion = matching rows / all rows in the sample.
sum(CDCData$genhlth == "excellent", na.rm = TRUE) / nrow(CDCData)
# Two-way contingency table: gender in rows, smoke100 in columns.
table(CDCData$gender,CDCData$smoke100)
# Draw the same cross-tabulation as a mosaic plot (used for Exercise 3).
mosaicplot(table(CDCData$gender,CDCData$smoke100))
# Exercise 3 - What does the mosaic plot reveal
# about smoking habits and gender?
# The mosaic plot reveals that fewer women
# than men are smokers
# Exercise 4 - Create a new object called under23_and_smoke
# that contains all observations of respondents under
# the age of 23 that have smoked 100 cigarettes in their lifetime.
# Write the command you used to create the new object
# as the answer to this exercise.
# Keep only respondents younger than 23 whose smoke100 flag is 1;
# rows where the condition is NA are dropped.
under23_and_smoke <- subset(CDCData, age < 23 & smoke100 == 1)
# Exercise 5 - What does this box plot show?
# Pick another categorical variable from the data set
# and see how it relates to BMI.
# List the variable you chose, why you might think
# it would have a relationship to BMI, and indicate what the figure seems to suggest.
# Answer: Box plot shows the distribution of the observations (cases).
# It helps to identify the outliers and eliminate them.
# Body mass index per respondent: weight over height squared, scaled by 703
# (presumably the conversion factor for pounds and inches - confirm units).
bmi <- with(CDCData, weight / height^2 * 703)
# Side-by-side box plots of BMI, one box per general-health category.
boxplot(bmi ~ CDCData$genhlth)
# The box plot of BMI against general health condition reveals a direct
# relationship between health status and BMI - healthier people
# have better BMI metrics and have good control over their weight.
# Box plots of BMI grouped by age.
# NOTE(review): age is listed above as a numeric variable, so this draws one
# box per distinct age value; the exercise asks for a categorical variable -
# confirm this choice is intended.
boxplot(bmi ~ CDCData$age)
# Replacing the health condition with the age factor shows that
# younger people have a lower BMI and that BMI increases during middle age.
# As people get older, they focus more on weight (and health),
# so the BMI comes down for older people.