source("more/cdc.R")
names(cdc)
## [1] "genhlth"  "exerany"  "hlthplan" "smoke100" "height"   "weight"  
## [7] "wtdesire" "age"      "gender"

This returns the names genhlth, exerany, hlthplan, smoke100, height, weight, wtdesire, age, and gender. Each one of these variables corresponds to a question that was asked in the survey. For example, for genhlth, respondents were asked to evaluate their general health, responding either excellent, very good, good, fair or poor. The exerany variable indicates whether the respondent exercised in the past month (1) or did not (0). Likewise, hlthplan indicates whether the respondent had some form of health coverage (1) or did not (0). The smoke100 variable indicates whether the respondent had smoked at least 100 cigarettes in her lifetime. The other variables record the respondent’s height in inches, weight in pounds as well as their desired weight, wtdesire, age in years, and gender.

  1. How many cases are there in this data set? How many variables? For each variable, identify its data type (e.g. categorical, discrete).

Return dimensions(rows and columns), use the str command, define each column’s data type

dim(cdc)
## [1] 20000     9
str(cdc)
## 'data.frame':    20000 obs. of  9 variables:
##  $ genhlth : Factor w/ 5 levels "excellent","very good",..: 3 3 3 3 2 2 2 2 3 3 ...
##  $ exerany : num  0 0 1 1 0 1 1 0 0 1 ...
##  $ hlthplan: num  1 1 1 1 1 1 1 1 1 1 ...
##  $ smoke100: num  0 1 1 0 0 0 0 0 1 0 ...
##  $ height  : num  70 64 60 66 61 64 71 67 65 70 ...
##  $ weight  : int  175 125 105 132 150 114 194 170 150 180 ...
##  $ wtdesire: int  175 115 105 124 130 114 185 160 130 170 ...
##  $ age     : int  77 33 49 42 55 55 31 45 27 44 ...
##  $ gender  : Factor w/ 2 levels "m","f": 1 2 2 2 2 2 1 1 2 1 ...
##   Column_name Numerical.Categorical Discrete.Coninuous.Ordinal
## 1     exerany           Categorical                       <NA>
## 2    hlthplan           Categorical                       <NA>
## 3    smoke100           Categorical                       <NA>
## 4      height             Numerical                   Discrete
## 5      weight             Numerical                   Discrete
## 6    wtdesire             Numerical                   Discrete
## 7         age             Numerical                   Discrete
## 8      gender           categorical                       <NA>
## 9       wdiff             Numerical                   Discrete
  1. Create a numerical summary for height and age, and compute the interquartile range for each. Compute the relative frequency distribution for gender and exerany. How many males are in the sample? What proportion of the sample reports being in excellent health?

Create an interquartile function and print out several frequency distribution functions

summary(cdc$height)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   48.00   64.00   67.00   67.18   70.00   93.00
summary(cdc$age)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   18.00   31.00   43.00   45.07   57.00   99.00
#Create IQR FUNCTION
quartile_range <- function(x){sorted <- sort(x)
    lower=median(sorted[1:10000])
    upper=median(sorted[10001:20000])
    inter= upper-lower
    return (c("1st_quart"=lower,"3rd_quart"=upper,"inter_quart"=inter))

}

#CREATE FREQUENCY TABLES
exercise_freq_dist <- table(cdc$exerany)/20000*100
gender_freq_dist <-   table(cdc$gender)/20000*100
healt_freq_dist <-    table(cdc$genhlth)/20000*100
males_in_sample <- gender_freq_dist[1]*200
great_health <- healt_freq_dist[1] *200

# USE PASTE FUNCTION FOR PRINT STATEMENTS
heights <- (paste(c("height_1st_quart is","height_3rd_quart is","height_inter_quartile range is"),c(quartile_range(cdc$height))))
ages <- (paste(c("age_1st_quart is","age_3rd_quart is","age_inter_quartile range is"),c(quartile_range(cdc$age))))
genders <- paste(c("percent males is","percent females is"),c(gender_freq_dist))
exercised <-paste(c("percent whom didn't exercise","percent whom did exercise"),c(exercise_freq_dist)) 
sampled_males <- paste("number of males",males_in_sample)
excellent_reported_health <- paste("number of respondents reporting excellent health",great_health)


print (c(heights,ages,genders,exercised,sampled_males,excellent_reported_health))
##  [1] "height_1st_quart is 64"                               
##  [2] "height_3rd_quart is 70"                               
##  [3] "height_inter_quartile range is 6"                     
##  [4] "age_1st_quart is 31"                                  
##  [5] "age_3rd_quart is 57"                                  
##  [6] "age_inter_quartile range is 26"                       
##  [7] "percent males is 47.845"                              
##  [8] "percent females is 52.155"                            
##  [9] "percent whom didn't exercise 25.43"                   
## [10] "percent whom did exercise 74.57"                      
## [11] "number of males 9569"                                 
## [12] "number of respondents reporting excellent health 4657"
mosaicplot(table(cdc$gender,cdc$smoke100))

  1. Create a new object called under23_and_smoke that contains all observations of respondents under the age of 23 that have smoked 100 cigarettes in their lifetime. Write the command you used to create the new object as the answer to this exercise.

Subset of young smokers and a frequency distribution for that young smoker subset

#CREATE SUBSET DATA AND PASTE FOR PRINT
under23_and_smoke <- subset(cdc, age<23,smoke100)
younger_smokers <- paste(c("distribution of  people under 23 that don't smoke ","distribution of people under 23 that do smoke"),c(table(under23_and_smoke)/dim(under23_and_smoke)[1]))
over_55_smokers <- subset(cdc, age>55,smoke100)
older_smokers <- paste(c("distribution of  people over 55 that don't smoke ","distribution of  people over 55 that do smoke"),c(table(over_55_smokers)/dim(over_55_smokers)[1]))


print(c(younger_smokers, older_smokers))
## [1] "distribution of  people under 23 that don't smoke  0.61609907120743"
## [2] "distribution of people under 23 that do smoke 0.38390092879257"     
## [3] "distribution of  people over 55 that don't smoke  0.457614542756446"
## [4] "distribution of  people over 55 that do smoke 0.542385457243554"

Quantitative data

boxplot(cdc$height)

You can compare the locations of the components of the box by examining the summary statistics.

summary(cdc$height)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   48.00   64.00   67.00   67.18   70.00   93.00

The notation here is new. The ~ character can be read versus or as a function of. So we’re asking R to give us a box plots of heights where the groups are defined by gender.

Next let’s consider a new variable that doesn’t show up directly in this data set: Body Mass Index (BMI) (http://en.wikipedia.org/wiki/Body_mass_index). BMI is a weight to height ratio and can be calculated as:

\[ BMI = \frac{weight~(lb)}{height~(in)^2} * 703 \]

703 is the approximate conversion factor to change units from metric (meters and kilograms) to imperial (inches and pounds).

The following two lines first make a new object called bmi and then creates box plots of these values, defining groups by the variable cdc$genhlth.

bmi <- (cdc$weight / cdc$height^2) * 703
boxplot(bmi ~ cdc$genhlth)

hist(bmi)

  1. What does this box plot show? Pick another categorical variable from the data set and see how it relates to BMI. List the variable you chose, why you might think it would have a relationship to BMI, and indicate what the figure seems to suggest.

Boxplot of bmi categorized by exercise column

boxplot(bmi ~ cdc$exerany)

summary(cdc$exerany==1)
##    Mode   FALSE    TRUE 
## logical    5086   14914
hist(bmi, breaks = 50)

On Your Own

library(stringr)
library(XML)
library(maps)
library(data.table)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:data.table':
## 
##     between, first, last
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(psych)
## 
## Attaching package: 'psych'
## The following object is masked _by_ '.GlobalEnv':
## 
##     heights
library(sjmisc)
library(ggplot2) 
## 
## Attaching package: 'ggplot2'
## The following objects are masked from 'package:psych':
## 
##     %+%, alpha
  1. Make a scatter plot of weight versus desired weight. Describe the relationship between these two variables.
library(ggplot2)
ggplot(cdc, aes(x = cdc$weight, y = cdc$wtdesire)) +
    theme_bw() +
    geom_point()

Eliminate outlier under the assumption that noone wants to weigh over 500 pounds

  1. Let’s consider a new variable: the difference between desired weight (wtdesire) and current weight (weight). Create this new variable by subtracting the two columns in the data frame and assigning them to a new object called wdiff.
#CREATE WDIFF COLUMN AND PRINT PLOT OF DATA WITHOUT OUTLIER
cdc$wdiff <-cdc$weight-cdc$wtdesire
sub_set_cdc <- subset(cdc,cdc$wdiff> -300)
library(ggplot2)
ggplot(sub_set_cdc, aes(x = sub_set_cdc$weight, y = sub_set_cdc$wtdesire)) +
    theme_bw() +
    geom_point()

  1. What type of data is wdiff? If an observation wdiff is 0, what does this mean about the person’s weight and desired weight. What if wdiff is positive or negative?

Some visual display and summary of our new Wdiff column

hist(sub_set_cdc$wdiff,breaks=41)

summary(sub_set_cdc$wdiff)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## -110.00    0.00   10.00   14.63   21.00  300.00
summary(cdc$wdiff)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## -500.00    0.00   10.00   14.59   21.00  300.00
d <- density(cdc$wdiff)
plot(d, main="Kernel Density of desired weight")
polygon(d, col="red", border="blue")

  1. Describe the distribution of wdiff in terms of its center, shape, and spread, including any plots you use. What does this tell us about how people feel about their current weight?
  1. Using numerical summaries and a side-by-side box plot, determine if men tend to view their weight differently than women.

WDIFF plotted side by side by gender

# SUBSET WDIFF COLUMN BY GENDER, PLOT GENDERS SIDE BY SIDE
sub_set_cdc_women <-subset(sub_set_cdc,sub_set_cdc$gender=="f") 
sub_set_cdc_men <-subset(sub_set_cdc,sub_set_cdc$gender=="m") 
summary(sub_set_cdc_women$wdiff)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  -83.00    0.00   10.00   18.15   27.00  300.00
summary(sub_set_cdc_men$wdiff)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## -110.00    0.00    5.00   10.79   20.00  300.00
gender_df <- rbind(data.frame(fill="blue", obs=sub_set_cdc_women$wdiff),
            data.frame(fill="red", obs=sub_set_cdc_men$wdiff))
ggplot(gender_df, aes(x=obs, fill=fill)) +
  geom_histogram(binwidth=10, colour="black", position="dodge") +
  scale_fill_identity()

    • Now it’s time to get creative. Find the mean and standard deviation of weight and determine what proportion of the weights are within one standard deviation of the mean.
# GET MEAN, GET STD DEVIATION, CREATE A SUBSET WITHIN 1 STD DEVIATION IN EACH DIRECTION
mean_cdc <- mean(sub_set_cdc$weight)
sd_cdc <- sd(sub_set_cdc$weight)
stand_dev_upper <-mean_cdc+sd_cdc
stand_dev_lower <- mean_cdc-sd_cdc 
cdc_data_within_sd <- subset(sub_set_cdc,sub_set_cdc$weight<stand_dev_upper &sub_set_cdc$weight>stand_dev_lower )
proportion_within_sd <- dim(cdc_data_within_sd)[1]/20000
print(paste(" the proportion of weights that fall within one standard deviation is",proportion_within_sd))
## [1] " the proportion of weights that fall within one standard deviation is 0.70755"

This is a product of OpenIntro that is released under a Creative Commons Attribution-ShareAlike 3.0 Unported. This lab was adapted for OpenIntro by Andrew Bray and Mine Çetinkaya-Rundel from a lab written by Mark Hansen of UCLA Statistics.