Lab 1

# Download the course .Rdata file and load the objects it contains into the
# workspace.
load(url("http://s3.amazonaws.com/assets.datacamp.com/course/dasi/cdc.Rdata"))

Note that the “load()” statement automatically creates the data set “cdc” so there is no need to assign the data to “cdc”. Also, when we are in markup we don't have to use the hashtags to comment our code because everything outside of the code chunk delimiters is ignored by the interpreter.

Now check to see that the data is loaded.

# Print the first rows of cdc to confirm the data frame was created.
head(cdc)
##     genhlth exerany hlthplan smoke100 height weight wtdesire age gender
## 1      good       0        1        0     70    175      175  77      m
## 2      good       0        1        1     64    125      115  33      f
## 3      good       1        1        1     60    105      105  49      f
## 4      good       1        1        0     66    132      124  42      f
## 5 very good       0        1        0     61    150      130  55      f
## 6 very good       1        1        0     64    114      114  55      f

Now inspect the names of the columns in the data set:

# List the column names of cdc.
names(cdc)
## [1] "genhlth"  "exerany"  "hlthplan" "smoke100" "height"   "weight"  
## [7] "wtdesire" "age"      "gender"

Try the tail and dim functions:

# Print the last rows of cdc (the counterpart of head()).
tail(cdc)
##         genhlth exerany hlthplan smoke100 height weight wtdesire age
## 19995      good       0        1        1     69    224      224  73
## 19996      good       1        1        0     66    215      140  23
## 19997 excellent       0        1        0     73    200      185  35
## 19998      poor       0        1        0     65    216      150  57
## 19999      good       1        1        0     67    165      165  81
## 20000      good       1        1        1     69    170      165  83
##       gender
## 19995      m
## 19996      f
## 19997      m
## 19998      f
## 19999      f
## 20000      m
# Dimensions of the data frame: number of rows, then number of columns.
dim(cdc)
## [1] 20000     9

Find the level of measurement of 'genhlth' and 'weight' and 'smoke100'

# class() reports how R stores each column. That is a hint about, but not the
# same thing as, the statistical level of measurement.
class(cdc$genhlth)
## [1] "factor"
class(cdc$weight)
## [1] "integer"
class(cdc$smoke100)
## [1] "numeric"

It turns out that I had changed smoke100 into a numeric variable at some earlier point, so now I have to change it back with the code:

# Convert the 0/1 smoke100 column to a factor so R treats it as categorical,
# then confirm the new class.
cdc$smoke100 <- as.factor(cdc$smoke100)
class(cdc$smoke100)
## [1] "factor"

Refresher exercise:

# Peek at the first few heights.
head(cdc$height)
## [1] 70 64 60 66 61 64
# Plain arithmetic stored in variables. Note that `sum` is also the name of a
# base R function; calls such as sum(x) still work because R skips
# non-function objects when it looks up a function, but a different variable
# name would be less confusing.
sum <- 84941 + 19686
mult <- 73 * 51

Now we get the basic summary statistics about a variable using the weight variable. These are called descriptive statistics because they describe the characteristics of one variable (as opposed to inferential statistics which generally describe the relationship between two or more variables).

# Arithmetic mean of the reported weights.
mean(cdc$weight)
## [1] 169.7
# Sample variance of the reported weights.
var(cdc$weight)
## [1] 1606
# Middle value of the reported weights.
median(cdc$weight)
## [1] 165
# Minimum, quartiles, median, mean and maximum in a single call.
summary(cdc$weight)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##      68     140     165     170     190     500

Now working with categorical data, data where all we can do is note whether a particular case is in the category or not. So, smoke100 tells us whether you smoked or not, 1 = yes, 0 = no. We use the table() function:

# Count how many respondents fall into each smoke100 category.
table(cdc$smoke100)
## 
##     0     1 
## 10559  9441

Now get a frequency table and a relative frequency table for genhlth.

# Raw counts per general-health category.
table(cdc$genhlth)
## 
## excellent very good      good      fair      poor 
##      4657      6972      5675      2019       677
# Relative frequencies: each count divided by the number of respondents
# (the first element of the data frame's dimensions is its row count).
relFreq <- table(cdc$genhlth) / dim(cdc)[1]
relFreq
## 
## excellent very good      good      fair      poor 
##   0.23285   0.34860   0.28375   0.10095   0.03385

Notice how the fractions all add up to 1, as they should.

Now we make some cool pictures by nesting the table for smoke100 inside the barplot() function.

# barplot() draws one bar per category, with heights taken from the counts
# that table() returns.
barplot(table(cdc$smoke100))

plot of chunk unnamed-chunk-11

Answering the quiz questions about gender

# Applied to the gender column, summary() prints the number of respondents in
# each category.
summary(cdc$gender)
##     m     f 
##  9569 10431

Now make a mosaic plot. This involves crosstabulation which we do by using the table command

# Cross-tabulate gender (rows) against smoke100 (columns) ...
gender_smokers <- table(cdc$gender, cdc$smoke100)
# ... and draw the two-way table as a mosaic plot.
mosaicplot(gender_smokers)

plot of chunk unnamed-chunk-13

names(cdc)
## [1] "genhlth"  "exerany"  "hlthplan" "smoke100" "height"   "weight"  
## [7] "wtdesire" "age"      "gender"
# This tells us that the index number for height is 5, i.e., that it is the
# fifth column. We can use that information to get the height of the 1337th
# respondent. Indexing is data[row, column].
height_1337 <- cdc[1337, 5]
height_1337
## [1] 70
# Likewise weight is the sixth column, so this is the weight of respondent 111.
weight_111 <- cdc[111, 6]
weight_111
## [1] 210

Now use ranges (the : operator) to select several rows and columns at once

# Rows 1 to 8 of columns 3 to 5 (hlthplan, smoke100, height).
first8 <- cdc[1:8, 3:5]
first8
##   hlthplan smoke100 height
## 1        1        0     70
## 2        1        1     64
## 3        1        1     60
## 4        1        0     66
## 5        1        0     61
## 6        1        0     64
## 7        1        0     71
## 8        1        0     67
# Rows 10 to 20 of columns 5 and 6.
# NOTE(review): the variable name suggests weight and gender, but columns 5:6
# are height and weight -- confirm which columns were intended.
wt_gen_10_20 <- cdc[10:20, 5:6]
wt_gen_10_20
##    height weight
## 10     70    180
## 11     69    186
## 12     69    168
## 13     66    185
## 14     70    170
## 15     69    170
## 16     73    185
## 17     67    156
## 18     71    185
## 19     75    200
## 20     67    125

Get everything on respondent 205

# Leaving the column index empty returns every column for row 205.
resp205 <- cdc[205, ]
# Leaving the row index empty returns every row for columns 5 and 6
# (height and weight).
ht_wt <- cdc[, 5:6]

resp205
##     genhlth exerany hlthplan smoke100 height weight wtdesire age gender
## 205    fair       0        0        1     61    200      125  49      f

# Here we use head to keep from printing out a mile of numbers
head(ht_wt)
##   height weight
## 1     70    175
## 2     64    125
## 3     60    105
## 4     66    132
## 5     61    150
## 6     64    114

Now access the data by using the names of columns

# Pull a single column by name with `$`, then index into the resulting vector.
# Smoking status of respondent 1000; smoke100 is a factor, so its levels are
# printed along with the value.
resp1000_smk <- cdc$smoke100[1000]
resp1000_smk
## [1] 1
## Levels: 0 1
# Heights of the first 30 respondents. Uses `<-` for assignment, consistent
# with the rest of this file.
first30_ht <- cdc$height[1:30]
first30_ht
##  [1] 70 64 60 66 61 64 71 67 65 70 69 69 66 70 69 73 67 71 75 67 69 65 73
## [24] 67 64 68 67 69 61 74

Create a variable for being over 50

# Flag each respondent whose age is 50 or more; the comparison yields a
# logical (TRUE/FALSE) vector that is stored as a new column.
cdc$over50 <- cdc$age >= 50
head(cdc$over50)
## [1]  TRUE FALSE FALSE FALSE  TRUE  TRUE
class(cdc$over50)
## [1] "logical"
# Store the flag as a factor so it is handled as a categorical variable.
cdc$over50 <- as.factor(cdc$over50)
class(cdc$over50)
## [1] "factor"

# Cross-tabulate the age flag (rows) against smoke100 (columns) and plot it.
over50_smoker <- table(cdc$over50, cdc$smoke100)
over50_smoker
##        
##            0    1
##   FALSE 7149 5508
##   TRUE  3410 3933
mosaicplot(over50_smoker)

plot of chunk unnamed-chunk-18

Now we want to create a subset of the data that has only the people who report having very good health.

names(cdc)
##  [1] "genhlth"  "exerany"  "hlthplan" "smoke100" "height"   "weight"  
##  [7] "wtdesire" "age"      "gender"   "over50"
# Deliberately kept mistake: this tries to store the subset as a new column
# of cdc and fails with the error shown below.
cdc$very_good <- subset(cdc, genhlth == "very good")
## Error: replacement has 6972 rows, data has 20000
# Here I have made the mistake of trying to create an addition to the cdc
# data set when what is really wanted is a new data set.
# Correct version: keep the matching rows in a separate data frame.
very_good <- subset(cdc, genhlth == "very good")
head(very_good)
##      genhlth exerany hlthplan smoke100 height weight wtdesire age gender
## 5  very good       0        1        0     61    150      130  55      f
## 6  very good       1        1        0     64    114      114  55      f
## 7  very good       1        1        0     71    194      185  31      m
## 8  very good       0        1        0     67    170      160  45      m
## 20 very good       1        1        0     67    125      120  33      f
## 21 very good       1        1        0     69    200      150  48      f
##    over50
## 5    TRUE
## 6    TRUE
## 7   FALSE
## 8   FALSE
## 20  FALSE
## 21  FALSE
# Note how it gives us only the cases that have general health reported as
# very good, including the original case ids which, instead of running from
# 1 to 6, are 5, 6, 7, 8, 20 and 21.

Now do the same for people over 50.

# Keep only respondents aged 50 or more. The threshold must be the number 50,
# not the string "50": comparing a number with a string converts the number
# to text and compares alphabetically, which would wrongly keep an age such
# as 6 and drop an age such as 100.
ageGtr50 <- subset(cdc, age >= 50)
head(ageGtr50)
##      genhlth exerany hlthplan smoke100 height weight wtdesire age gender
## 1       good       0        1        0     70    175      175  77      m
## 5  very good       0        1        0     61    150      130  55      f
## 6  very good       1        1        0     64    114      114  55      f
## 12      fair       1        1        1     69    168      148  62      m
## 14 excellent       1        1        1     70    170      170  69      m
## 16      good       1        1        1     73    185      175  79      m
##    over50
## 1    TRUE
## 5    TRUE
## 6    TRUE
## 12   TRUE
## 14   TRUE
## 16   TRUE

So now we use logical conditionals to create a subset of the data based on two or more characteristics, in this case smokers under the age of 23:

# `&` combines the two conditions element by element, so a row is kept only
# when the respondent is under 23 and has smoke100 equal to 1.
under23_and_smoke <- subset(cdc, age < 23 & smoke100 == 1)
head(under23_and_smoke)
##       genhlth exerany hlthplan smoke100 height weight wtdesire age gender
## 13  excellent       1        0        1     66    185      220  21      m
## 37  very good       1        0        1     70    160      140  18      f
## 96  excellent       1        1        1     74    175      200  22      m
## 180      good       1        1        1     64    190      140  20      f
## 182 very good       1        1        1     62     92       92  21      f
## 240 very good       1        0        1     64    125      115  22      f
##     over50
## 13   FALSE
## 37   FALSE
## 96   FALSE
## 180  FALSE
## 182  FALSE
## 240  FALSE
# Number of respondents that satisfy both conditions.
nrow(under23_and_smoke)
## [1] 620

Now for some pictures that help us visualize descriptive statistics, starting with comparing boxplot with the numerical descriptives provided by summary().

# Box-and-whisker plot of all reported heights.
boxplot(cdc$height)

plot of chunk unnamed-chunk-22

# Numerical summary to compare against the boxplot: the median and the
# quartiles correspond to the line inside the box and to the box edges.
summary(cdc$height)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    48.0    64.0    67.0    67.2    70.0    93.0

Now we look at functions that introduce the use of the '~' operator, which reads 'as a function of' or, more colloquially, 'versus'. We use this to compare the weights of respondents that smoke versus those that don't, or, in other words, look at weight as a function of smoking.

boxplot(cdc$smoke100 ~ cdc$weight)
## Error: adding class "factor" to an invalid object
# Note that this does not work: the variables are the wrong way round, so
# weight ends up as the x variable and the call stops with the error above.
# We read the statements formed with the ~ operator as 'y as a function of
# x', so the variable that is going to define the categories, or the x
# variable, should come second. In this case we want to see the weight of
# respondents compared by whether or not they smoke, i.e., we want to see
# weight (the y variable) as a function of smoking (the x variable), or:
boxplot(cdc$weight ~ cdc$smoke100)

plot of chunk unnamed-chunk-23

Now we look at whether the reported general health of respondents is related to the bmi of respondents. We are asking if healthier people weigh less on average. So we want to see bmi ~ genhlth, i.e., bmi as a function of health. We first calculate the bmi for each respondent by the expression weight / height^2 * 703. Then we use that as the dependent variable in a boxplot display.

# BMI for every respondent: weight divided by squared height, scaled by 703.
# with() evaluates the expression using the columns of cdc directly.
cdc$bmi <- with(cdc, (weight / height^2) * 703)
head(cdc$bmi)
## [1] 25.11 21.45 20.50 21.30 28.34 19.57
# One box of bmi values per general-health category.
boxplot(cdc$bmi ~ cdc$genhlth)

plot of chunk unnamed-chunk-24

Finally, we create some histograms. Histograms are a good way to visualize the distribution of a variable, i.e., how spread out its values are. We are going to look at the bmi variable and vary the number of breaks in the histogram: first the default, then 50, then 100.

# Histogram of bmi with the default choice of bins.
hist(cdc$bmi)

plot of chunk unnamed-chunk-25

# A single number for `breaks` is a suggestion for how many bins to use;
# larger values give narrower bins and show finer detail.
hist(cdc$bmi, breaks = 50)

plot of chunk unnamed-chunk-25

hist(cdc$bmi, breaks = 100)

plot of chunk unnamed-chunk-25

Now look at a scatter plot. Scatter plots are used for comparing the relationship between two interval level variables. Here we will plot the relationship between weight and desired weight. We will treat weight as the independent variable or the x variable.

# The formula is y ~ x: desired weight goes on the y axis and current weight
# on the x axis.
plot(cdc$wtdesire ~ cdc$weight)

plot of chunk unnamed-chunk-26

This is an example of a moderately weak, positive linear association.