Exercise 1. How many cases are there in this data set? How many variables? For each variable, identify its data type (e.g. categorical, discrete).
# Load the CDC sample used throughout this lab. Sourcing the script makes the
# `cdc` data frame available. The script is fetched over HTTPS so the code
# that gets executed cannot be altered in transit.
source("https://www.openintro.org/stat/data/cdc.R")
# Number of cases (rows) and number of variables (columns).
dim(cdc)
## [1] 20000 9
# Storage type and first few values of every variable.
str(cdc)
## 'data.frame': 20000 obs. of 9 variables:
## $ genhlth : Factor w/ 5 levels "excellent","very good",..: 3 3 3 3 2 2 2 2 3 3 ...
## $ exerany : num 0 0 1 1 0 1 1 0 0 1 ...
## $ hlthplan: num 1 1 1 1 1 1 1 1 1 1 ...
## $ smoke100: num 0 1 1 0 0 0 0 0 1 0 ...
## $ height : num 70 64 60 66 61 64 71 67 65 70 ...
## $ weight : int 175 125 105 132 150 114 194 170 150 180 ...
## $ wtdesire: int 175 115 105 124 130 114 185 160 130 170 ...
## $ age : int 77 33 49 42 55 55 31 45 27 44 ...
## $ gender : Factor w/ 2 levels "m","f": 1 2 2 2 2 2 1 1 2 1 ...
library(knitr)
library(kableExtra)
# Sampling the data: show 10 randomly chosen rows of `cdc` as a striped table.
# No seed is set, so a different sample is drawn on every run.
# `FALSE` is spelled out because `F` is an ordinary variable that can be
# reassigned.
kable(cdc[sample(nrow(cdc), 10), ]) %>%
  kable_styling(bootstrap_options = "striped", full_width = FALSE)
| | genhlth | exerany | hlthplan | smoke100 | height | weight | wtdesire | age | gender |
|---|---|---|---|---|---|---|---|---|---|
| 13187 | excellent | 0 | 1 | 1 | 60 | 120 | 118 | 34 | f |
| 3354 | excellent | 1 | 0 | 0 | 74 | 210 | 200 | 19 | m |
| 10796 | very good | 1 | 1 | 0 | 60 | 108 | 108 | 75 | f |
| 12724 | poor | 1 | 1 | 1 | 63 | 108 | 118 | 63 | f |
| 7108 | very good | 0 | 1 | 1 | 70 | 195 | 170 | 60 | m |
| 268 | excellent | 1 | 1 | 0 | 74 | 185 | 180 | 47 | m |
| 18087 | good | 0 | 1 | 0 | 65 | 134 | 128 | 52 | f |
| 12329 | good | 1 | 1 | 0 | 62 | 200 | 150 | 67 | f |
| 1154 | good | 0 | 1 | 1 | 75 | 300 | 250 | 40 | m |
| 2237 | good | 0 | 1 | 1 | 68 | 170 | 170 | 76 | m |
# Classification of each `cdc` variable, one vector per table column.
# NOTE(review): the `Quantitative` column holds the broad type
# (Categorical / Numerical) and `Qualitative` holds the subtype
# (Ordinal / Nominal / Discrete). The names are kept as they are because they
# become the headers of the rendered table.
Variable <- c(
  "genhlth", "exerany", "hlthplan", "smoke100", "height",
  "weight", "wtdesire", "age", "gender"
)
Quantitative <- c(
  "Categorical", "Categorical", "Categorical", "Categorical", "Numerical",
  "Numerical", "Numerical", "Numerical", "Categorical"
)
Qualitative <- c(
  "Ordinal", "Nominal", "Nominal", "Nominal", "Discrete",
  "Discrete", "Discrete", "Discrete", "Nominal"
)
Reason <- c(
  "Different Levels of data",
  "Binary Value, either Yes or No",
  "Binary Value, either Yes or No",
  "Binary Value, either Yes or No",
  "Possible Finite number",
  "Possible Finite number",
  "Possible Finite number",
  "Possible Finite number",
  "Have m or f - can be depicted as Binary"
)
# Column names of the data frame are taken from the vector names above.
datatype <- data.frame(Variable, Quantitative, Qualitative, Reason)
# Datatype Analysis
kable(datatype) %>%
  kable_styling(bootstrap_options = "striped", full_width = FALSE)
| Variable | Quantitative | Qualitative | Reason |
|---|---|---|---|
| genhlth | Categorical | Ordinal | Different Levels of data |
| exerany | Categorical | Nominal | Binary Value, either Yes or No |
| hlthplan | Categorical | Nominal | Binary Value, either Yes or No |
| smoke100 | Categorical | Nominal | Binary Value, either Yes or No |
| height | Numerical | Discrete | Possible Finite number |
| weight | Numerical | Discrete | Possible Finite number |
| wtdesire | Numerical | Discrete | Possible Finite number |
| age | Numerical | Discrete | Possible Finite number |
| gender | Categorical | Nominal | Have m or f - can be depicted as Binary |
Exercise 2. Create a numerical summary for ‘height’ and ‘age’, and compute the interquartile range for each. Compute the relative frequency distribution for ‘gender’ and ‘exerany’. How many males are in the sample? What proportion of the sample reports being in excellent health?
# Numerical summaries of height and age
summary(cdc[["height"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 48.00 64.00 67.00 67.18 70.00 93.00
summary(cdc[["age"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 18.00 31.00 43.00 45.07 57.00 99.00
# Interquartile range: distance between the third and the first quartile
diff(quantile(cdc[["height"]], c(0.25, 0.75), names = FALSE))
## [1] 6
diff(quantile(cdc[["age"]], c(0.25, 0.75), names = FALSE))
## [1] 26
# Share of respondents in each gender category
table(cdc[["gender"]]) / nrow(cdc)
##
## m f
## 0.47845 0.52155
# Share of respondents for each value of exerany
table(cdc[["exerany"]]) / nrow(cdc)
##
## 0 1
## 0.2543 0.7457
# Number of male respondents
table(cdc[["gender"]])["m"]
## m
## 9569
# Share of respondents reporting excellent general health
table(cdc[["genhlth"]])["excellent"] / nrow(cdc)
## excellent
## 0.23285
Exercise 3. What does the mosaic plot reveal about smoking habits and gender?
# Mosaic plot of the gender x smoke100 contingency table.
# `legend` is not a mosaicplot() argument (it only triggered an
# "extra argument 'legend' will be disregarded" warning), and `color` is only
# used when `shade` is FALSE, so both are dropped; `shade = TRUE` is kept.
mosaicplot(
  table(cdc$gender, cdc$smoke100),
  main = "Gender Smoking Habits",
  shade = TRUE
)
The mosaic plot reveals that a larger proportion of males than females have smoked at least 100 cigarettes.
Exercise 4. Create a new object called ‘under23_and_smoke’ that contains all observations of respondents under the age of 23 that have smoked 100 cigarettes in their lifetime. Write the command you used to create the new object as the answer to this exercise.
# Keep the respondents younger than 23 whose smoke100 indicator is 1.
# which() drops rows where the condition is NA, as subset() does.
under23_and_smoke <- cdc[which(cdc$age < 23 & cdc$smoke100 == 1), ]
# Per-variable summary of the selected rows
summary(under23_and_smoke)
## genhlth exerany hlthplan smoke100
## excellent:110 Min. :0.0000 Min. :0.0000 Min. :1
## very good:244 1st Qu.:1.0000 1st Qu.:0.0000 1st Qu.:1
## good :204 Median :1.0000 Median :1.0000 Median :1
## fair : 53 Mean :0.8145 Mean :0.6952 Mean :1
## poor : 9 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.:1
## Max. :1.0000 Max. :1.0000 Max. :1
## height weight wtdesire age gender
## Min. :59.00 Min. : 85.0 Min. : 80.0 Min. :18.00 m:305
## 1st Qu.:65.00 1st Qu.:130.0 1st Qu.:125.0 1st Qu.:19.00 f:315
## Median :68.00 Median :155.0 Median :150.0 Median :20.00
## Mean :67.92 Mean :158.9 Mean :152.2 Mean :20.22
## 3rd Qu.:71.00 3rd Qu.:180.0 3rd Qu.:175.0 3rd Qu.:21.00
## Max. :79.00 Max. :350.0 Max. :315.0 Max. :22.00
# Rows and columns of the selection
c(nrow(under23_and_smoke), ncol(under23_and_smoke))
## [1] 620 9
Exercise 5. What does this box plot show? Pick another categorical variable from the data set and see how it relates to BMI. List the variable you chose, why you might think it would have a relationship to BMI, and indicate what the figure seems to suggest.
# Body mass index from pounds and inches: 703 * weight / height^2
bmi <- 703 * cdc$weight / cdc$height^2
# BMI by gender
boxplot(
  bmi ~ cdc$gender,
  col = "green",
  xlab = "Gender", ylab = "BMI",
  main = "Gender BMI", font.main = 3, cex.main = 1.2
)
# BMI by the smoke100 indicator
boxplot(
  bmi ~ cdc$smoke100,
  col = "red",
  xlab = "Smoke Level", ylab = "BMI",
  main = "Smoke100 BMI", font.main = 3, cex.main = 1.2
)
# BMI by self-reported general health
boxplot(
  bmi ~ cdc$genhlth,
  col = "blue",
  xlab = "Health Grade", ylab = "BMI",
  main = "General Health BMI", font.main = 3, cex.main = 1.2
)
BMI is lower for females than for males.
Smoking status (smoke100) does not appear to affect BMI.
BMI by health grade shows that respondents with a good BMI tend to report excellent health.
Question 1. Make a scatterplot of weight versus desired weight. Describe the relationship between these two variables.
library(ggplot2)
# Scatterplot of desired weight (y) against current weight (x); points are
# coloured by current weight on a blue-to-orange gradient.
ggplot(data = cdc, mapping = aes(x = weight, y = wtdesire, colour = weight)) +
  geom_point() +
  scale_colour_gradient(low = "#0091ff", high = "#f0650e") +
  theme_minimal()
Desired weight increases steadily as current weight increases: the two variables are positively associated.
Question 2. Let’s consider a new variable: the difference between desired weight (‘wtdesire’) and current weight (‘weight’). Create this new variable by subtracting the two columns in the data frame and assigning them to a new object called ‘wdiff’.
# Desired weight minus current weight, one value per respondent
wdiff <- with(cdc, wtdesire - weight)
Question 3. What type of data is ‘wdiff’? If an observation ‘wdiff’ is 0, what does this mean about the person’s weight and desired weight. What if ‘wdiff’ is positive or negative?
# Storage type and first values of wdiff
utils::str(wdiff)
## int [1:20000] 0 -10 0 -8 -20 0 -9 -10 -20 -10 ...
# Range, quartiles and mean of wdiff
base::summary(wdiff)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -300.00 -21.00 -10.00 -14.59 0.00 500.00
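wdiff < 0: the desired weight is below the current weight, so the person wants to lose weight.
wdiff = 0: the person is already at their desired weight.
wdiff > 0: the desired weight is above the current weight, so the person wants to gain weight.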
Question 4. Describe the distribution of ‘wdiff’ in terms of its center, shape, and spread, including any plots you use. What does this tell us about how people feel about their current weight?
# Centre and spread of wdiff
summary(wdiff)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -300.00 -21.00 -10.00 -14.59 0.00 500.00
# Shape: kernel density estimate with the area under the curve filled in.
# polygon() is given the density object (its x/y curve), not the raw vector,
# so the fill follows the curve that was just plotted.
wdiff_density <- density(wdiff)
plot(wdiff_density, main = "Density")
polygon(wdiff_density, col = "red", border = "blue")
# Histogram of the same values; an index-vs-value scatter does not show the
# distribution.
hist(wdiff, main = "Distribution of weight Difference", xlab = "wdiff")
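The distribution of wdiff is centered slightly below zero (median -10, mean -14.59), with the middle half of the values between -21 and 0 and long tails in both directions (-300 to 500). For most people the difference between weight and desired weight is small, and those who want a change mostly want to lose weight.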
Question 5. Using numerical summaries and a side-by-side box plot, determine if men tend to view their weight differently than women.
# Numerical summary of (desired weight - current weight) for each gender
tapply(cdc$wtdesire - cdc$weight, cdc$gender, summary)
# Side-by-side box plot of the same difference. The difference is what shows
# how respondents view their current weight, so it replaces the raw desired
# weight on the y axis. The unused `fill = weight` aesthetic is dropped (the
# fixed fill = "white" overrode it).
ggplot(cdc, aes(x = gender, y = wtdesire - weight)) +
  geom_boxplot(
    fill = "white", colour = "#3366FF",
    outlier.colour = "red", outlier.shape = 1
  ) +
  labs(y = "wtdesire - weight")
Males are mostly closer to their desired weight than females.
Question 6. Now it’s time to get creative. Find the mean and standard deviation of ‘weight’ and determine what proportion of the weights are within one standard deviation of the mean.
#Summary information to find the mean
# Summary information to find the mean
summary(cdc$weight)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 68.0 140.0 165.0 169.7 190.0 500.0
# Mean and standard deviation are computed once and reused below.
wt_mean <- mean(cdc$weight)
wt_sd <- sd(cdc$weight)
wt_sd
## [1] 40.08097
# Relative frequency of each distinct weight value
plot(
  table(cdc$weight) / length(cdc$weight),
  col = rainbow(25), main = "Proportion of weight"
)
# Rows whose weight lies strictly within one standard deviation of the mean
prop_weight_1sdmean <- subset(
  cdc,
  weight < wt_mean + wt_sd & weight > wt_mean - wt_sd
)
# Proportion of all respondents that fall in that band
nrow(prop_weight_1sdmean) / nrow(cdc)
## [1] 0.7076