---
title: "R Notebook - Importing CDC Data"
author: Adilene Garcia
output:
  html_document:
    df_print: paged
---

# Load the plotting package used by every ggplot() call in this notebook.
library(ggplot2)
# Look at the BRFSS data: run OpenIntro's script, which is expected to create
# the `cdc` data frame used throughout the rest of the notebook.
# HTTPS is used so the remote code cannot be altered in transit before
# source() executes it.
source("https://www.openintro.org/stat/data/cdc.R")
# Another way to load data from the internet:
# read.csv(url name)

0 means ‘no’ and 1 means ‘yes’ for the variables hlthplan and smoke100

# Plot weight against height with base R graphics.
# Supplying the data frame through `data = cdc` keeps the formula readable and
# labels the axes "height" and "weight" rather than "cdc$height"/"cdc$weight".
plot(weight ~ height, data = cdc)

Practice: Take 3-5 min to try to make this plot in ggplot. Hint: you will use + geom_line() instead of geom_histogram()

# Histogram of respondent height.
# NOTE(review): binwidth = 20 is wide relative to the height range reported by
# summary(cdc) (48 to 93), so only a few bars are drawn; confirm this is
# intended.
ggplot(cdc, aes(height)) +
  geom_histogram(binwidth = 20)

# Line graph of weight (y) against height (x).
ggplot(cdc, aes(height, weight)) +
  geom_line()

# Scatter plot of weight (y) against height (x).
ggplot(cdc, aes(height, weight)) +
  geom_point()

# Square brackets select rows and columns: cdc[rows, columns].
# Weight (the 6th column) of the 567th observation.
cdc[567, "weight"]
## [1] 160
# Weight of the first 10 people.
cdc[seq_len(10), "weight"]
##  [1] 175 125 105 132 150 114 194 170 150 180
# Leaving the row position empty returns every row of the chosen column:
# cdc[, 6]
# Leaving the column position empty returns every column for the chosen rows,
# here the first ten observations.
cdc[seq_len(10), ]
##      genhlth exerany hlthplan smoke100 height weight wtdesire age gender
## 1       good       0        1        0     70    175      175  77      m
## 2       good       0        1        1     64    125      115  33      f
## 3       good       1        1        1     60    105      105  49      f
## 4       good       1        1        0     66    132      124  42      f
## 5  very good       0        1        0     61    150      130  55      f
## 6  very good       1        1        0     64    114      114  55      f
## 7  very good       1        1        0     71    194      185  31      m
## 8  very good       0        1        0     67    170      160  45      m
## 9       good       0        1        1     65    150      130  27      f
## 10      good       1        1        0     70    180      170  44      m
# Columns can also be accessed by name:
# cdc$height

Practice: Use bracket notation to make a scatterplot of height and weight for the first 100 respondents. There are multiple ways to do this—find one that works!

# Class example: draw 100 row numbers at random (without replacement).
# nrow(cdc) replaces a hard-coded 20000 so the draw always matches the size of
# the data that was actually loaded.
randomrows <- sample(nrow(cdc), 100)
# Scatter plot (ggplot) restricted to the 100 randomly chosen rows.
# Other ways to take 100 rows: cdc[1:100, ] for the first hundred, or
# tail(cdc, 100) for the last hundred.
ggplot(data = cdc[randomrows, ], aes(x = height, y = weight)) +
  geom_point()

# Base R scatter plot of height and weight for the first 100 respondents.
plot(cdc$height[1:100], cdc$weight[1:100])

# enter comments from picture on phone feb 2
# Summary of every column in cdc: min, quartiles, median, mean and max for the
# numeric columns, and counts per level for the categorical ones.
summary(object = cdc)
##       genhlth        exerany          hlthplan         smoke100     
##  excellent:4657   Min.   :0.0000   Min.   :0.0000   Min.   :0.0000  
##  very good:6972   1st Qu.:0.0000   1st Qu.:1.0000   1st Qu.:0.0000  
##  good     :5675   Median :1.0000   Median :1.0000   Median :0.0000  
##  fair     :2019   Mean   :0.7457   Mean   :0.8738   Mean   :0.4721  
##  poor     : 677   3rd Qu.:1.0000   3rd Qu.:1.0000   3rd Qu.:1.0000  
##                   Max.   :1.0000   Max.   :1.0000   Max.   :1.0000  
##      height          weight         wtdesire          age        gender   
##  Min.   :48.00   Min.   : 68.0   Min.   : 68.0   Min.   :18.00   m: 9569  
##  1st Qu.:64.00   1st Qu.:140.0   1st Qu.:130.0   1st Qu.:31.00   f:10431  
##  Median :67.00   Median :165.0   Median :150.0   Median :43.00            
##  Mean   :67.18   Mean   :169.7   Mean   :155.1   Mean   :45.07            
##  3rd Qu.:70.00   3rd Qu.:190.0   3rd Qu.:175.0   3rd Qu.:57.00            
##  Max.   :93.00   Max.   :500.0   Max.   :680.0   Max.   :99.00
# Box plot of respondent age.
ggplot(cdc, aes(age)) +
  geom_boxplot()

# Conditioning: which() returns the row numbers where age <= 30.
cdc30_rows <- which(cdc$age <= 30)
# Use those rows to build the subset of respondents aged 30 or younger.
cdc30 <- cdc[cdc30_rows, ]
# Proportion of all respondents who are 30 or younger, computed from the data
# instead of hand-typed counts so it stays correct if the data changes.
nrow(cdc30) / nrow(cdc)
## [1] 0.2317
# Count how many people in cdc30 identify as 'f'.
# na.rm = TRUE skips missing values, matching what which() did before.
sum(cdc30$gender == "f", na.rm = TRUE)
## [1] 2409
# Bar chart of gender within the 30-and-under subset.
ggplot(data = cdc30, aes(x = gender)) +
  geom_bar()

Practice:

The ages of the participants in this study ranged from 18 to 99 years old. About half of the participants in this study are between the ages of 31 and 57. Most of the participants were around 40 years old. There are two outliers in this study, which are the people who are 99 years old.

# Summary of every column in cdc: min, quartiles, median, mean and max for the
# numeric columns, and counts per level for the categorical ones.
summary(object = cdc)
##       genhlth        exerany          hlthplan         smoke100     
##  excellent:4657   Min.   :0.0000   Min.   :0.0000   Min.   :0.0000  
##  very good:6972   1st Qu.:0.0000   1st Qu.:1.0000   1st Qu.:0.0000  
##  good     :5675   Median :1.0000   Median :1.0000   Median :0.0000  
##  fair     :2019   Mean   :0.7457   Mean   :0.8738   Mean   :0.4721  
##  poor     : 677   3rd Qu.:1.0000   3rd Qu.:1.0000   3rd Qu.:1.0000  
##                   Max.   :1.0000   Max.   :1.0000   Max.   :1.0000  
##      height          weight         wtdesire          age        gender   
##  Min.   :48.00   Min.   : 68.0   Min.   : 68.0   Min.   :18.00   m: 9569  
##  1st Qu.:64.00   1st Qu.:140.0   1st Qu.:130.0   1st Qu.:31.00   f:10431  
##  Median :67.00   Median :165.0   Median :150.0   Median :43.00            
##  Mean   :67.18   Mean   :169.7   Mean   :155.1   Mean   :45.07            
##  3rd Qu.:70.00   3rd Qu.:190.0   3rd Qu.:175.0   3rd Qu.:57.00            
##  Max.   :93.00   Max.   :500.0   Max.   :680.0   Max.   :99.00
# Box plot of respondent age.
ggplot(cdc, aes(age)) +
  geom_boxplot()

# Conditioning: which() returns the row numbers where age >= 50.
cdc50_rows <- which(cdc$age >= 50)
# Use those rows to build the subset of respondents aged 50 or older.
cdc50 <- cdc[cdc50_rows, ]
# Proportion of all respondents who are 50 or older, computed from the data
# instead of hand-typed counts so it stays correct if the data changes.
nrow(cdc50) / nrow(cdc)
## [1] 0.36715