source("https://raw.githubusercontent.com/czhu505/W1_lab1_606/master/cdc.R")

Q1. How many cases are there in this data set? How many variables? For each variable, identify its data type (e.g. categorical, discrete).

# Size of the data set: number of cases (rows), then number of variables (columns).
c(nrow(cdc), ncol(cdc))
## [1] 20000     9
# 20000 cases and 9 variables.

# Structure of the data frame: each variable's name, storage type, and first values.
str(cdc)
## 'data.frame':    20000 obs. of  9 variables:
##  $ genhlth : Factor w/ 5 levels "excellent","very good",..: 3 3 3 3 2 2 2 2 3 3 ...
##  $ exerany : num  0 0 1 1 0 1 1 0 0 1 ...
##  $ hlthplan: num  1 1 1 1 1 1 1 1 1 1 ...
##  $ smoke100: num  0 1 1 0 0 0 0 0 1 0 ...
##  $ height  : num  70 64 60 66 61 64 71 67 65 70 ...
##  $ weight  : int  175 125 105 132 150 114 194 170 150 180 ...
##  $ wtdesire: int  175 115 105 124 130 114 185 160 130 170 ...
##  $ age     : int  77 33 49 42 55 55 31 45 27 44 ...
##  $ gender  : Factor w/ 2 levels "m","f": 1 2 2 2 2 2 1 1 2 1 ...
# Categorical (a limited, usually fixed, set of values): genhlth, exerany,
# hlthplan, smoke100, gender. exerany, hlthplan and smoke100 are stored as
# numbers, but the values shown are only 0 and 1.
# Numerical, recorded here as whole numbers (discrete): height, weight,
# wtdesire, age.

Q2. Create a numerical summary for height and age, and compute the interquartile range for each. Compute the relative frequency distribution for gender and exerany. How many males are in the sample? What proportion of the sample reports being in excellent health?

# Numerical summary for height.
summary(cdc$height)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   48.00   64.00   67.00   67.18   70.00   93.00
# Interquartile range of height (Q3 - Q1), computed from the data instead of
# being typed in from the rounded summary above.
interq_height <- IQR(cdc$height)
interq_height
## [1] 6

# Numerical summary for age.
summary(cdc$age)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   18.00   31.00   43.00   45.07   57.00   99.00
# Interquartile range of age (Q3 - Q1).
interq_age <- IQR(cdc$age)
interq_age
## [1] 26

# Joint relative frequency distribution of gender (rows) and exerany (columns).
# prop.table() divides by the table total, so no sample size is hard-coded.
prop.table(table(cdc$gender, cdc$exerany))
##    
##           0       1
##   m 0.10745 0.37100
##   f 0.14685 0.37470
# Relative frequency distribution of each variable on its own
# (these are the row and column sums of the joint table above).
prop.table(table(cdc$gender))
## 
##       m       f 
## 0.47845 0.52155
prop.table(table(cdc$exerany))
## 
##      0      1 
## 0.2543 0.7457

# Number of males in the sample: count the TRUE values of the comparison.
sum(cdc$gender == "m")
## [1] 9569
# Proportion of the sample reporting excellent health: the mean of a logical
# vector is the share of TRUE values.
mean(cdc$genhlth == "excellent")
## [1] 0.23285

Q3. What does the mosaic plot reveal about smoking habits and gender?

# Fewer females than males report having smoked 100 cigarettes (smoke100).

On Your Own

# Scatterplot of current weight (x) against desired weight (y).
plot(cdc$weight, cdc$wtdesire, type = "p")

# Weight and desired weight are positively correlated.

# Difference between desired and current weight for every respondent.
wdiff <- with(cdc, wtdesire - weight)
str(wdiff)
##  int [1:20000] 0 -10 0 -8 -20 0 -9 -10 -20 -10 ...
# wdiff == 0: desired weight equals current weight.
# wdiff <  0: desired weight is less than current weight.
# wdiff >  0: desired weight is greater than current weight.
summary(wdiff)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## -300.00  -21.00  -10.00  -14.59    0.00  500.00
# The 1st quartile, median and mean are negative and the 3rd quartile is 0,
# which tells us that many more people desire a lower weight than a higher one.
# Side-by-side boxplots of weight for each gender.
boxplot(cdc$weight ~ cdc$gender)

# Women weigh less than men.

# Centre and spread of weight.
mean(cdc$weight)
## [1] 169.683
var(cdc$weight)
## [1] 1606.484
summary(cdc$weight)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    68.0   140.0   165.0   169.7   190.0   500.0

# Proportion of weights within one standard deviation of the mean, i.e. with
# |weight - mean| <= sd. The mean of the logical vector is the share of TRUE
# values, so the sample size does not need to be hard-coded.
# NOTE(review): the earlier version computed the share of weights <= 140 (the
# first quartile), which gave 0.2671; that is not the quantity asked for.
# Re-run this line to record the correct output.
weight_mean <- mean(cdc$weight)
weight_sd <- sd(cdc$weight)
mean(abs(cdc$weight - weight_mean) <= weight_sd)