# Run the course script (presumably the one that creates `cdc`), then preview
# the first six rows of the data frame.
source("more/cdc.R")
head(cdc, n = 6L)
## genhlth exerany hlthplan smoke100 height weight wtdesire age gender
## 1 good 0 1 0 70 175 175 77 m
## 2 good 0 1 1 64 125 115 33 f
## 3 good 1 1 1 60 105 105 49 f
## 4 good 1 1 0 66 132 124 42 f
## 5 very good 0 1 0 61 150 130 55 f
## 6 very good 1 1 0 64 114 114 55 f
# List the column names of cdc.
names(cdc)
## [1] "genhlth" "exerany" "hlthplan" "smoke100" "height" "weight"
## [7] "wtdesire" "age" "gender"
# Show the structure of cdc: dimensions, column types and leading values.
str(cdc)
## 'data.frame': 20000 obs. of 9 variables:
## $ genhlth : Factor w/ 5 levels "excellent","very good",..: 3 3 3 3 2 2 2 2 3 3 ...
## $ exerany : num 0 0 1 1 0 1 1 0 0 1 ...
## $ hlthplan: num 1 1 1 1 1 1 1 1 1 1 ...
## $ smoke100: num 0 1 1 0 0 0 0 0 1 0 ...
## $ height : num 70 64 60 66 61 64 71 67 65 70 ...
## $ weight : int 175 125 105 132 150 114 194 170 150 180 ...
## $ wtdesire: int 175 115 105 124 130 114 185 160 130 170 ...
## $ age : int 77 33 49 42 55 55 31 45 27 44 ...
## $ gender : Factor w/ 2 levels "m","f": 1 2 2 2 2 2 1 1 2 1 ...
Summary of height and age
# Distribution summary of the height column.
summary(cdc[["height"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 48.00 64.00 67.00 67.18 70.00 93.00
# Distribution summary of the age column.
summary(cdc[["age"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 18.00 31.00 43.00 45.07 57.00 99.00
Interquartile range for height
# Interquartile range (3rd quartile minus 1st quartile) of height.
# IQR() computes this directly instead of picking positions 5 and 2 out of
# the summary() result; na.rm = TRUE mirrors summary(), which ignores NAs.
cat("Total: ", IQR(cdc$height, na.rm = TRUE))
## Total: 6
Interquartile range for age
# Interquartile range (3rd quartile minus 1st quartile) of age.
# IQR() computes this directly instead of picking positions 5 and 2 out of
# the summary() result; na.rm = TRUE mirrors summary(), which ignores NAs.
cat("Total: ", IQR(cdc$age, na.rm = TRUE))
## Total: 26
Relative frequency distribution for gender
# Relative frequency of each gender level.
# prop.table() divides the counts by their own total, so the shares always
# sum to 1 (dividing by length() would not if the column contained NAs).
prop.table(table(cdc$gender))
##
## m f
## 0.47845 0.52155
Relative frequency distribution for exerany
# Relative frequency of each exerany value.
# prop.table() divides the counts by their own total, so the shares always
# sum to 1 (dividing by length() would not if the column contained NAs).
prop.table(table(cdc$exerany))
##
## 0 1
## 0.2543 0.7457
Number of males in the sample
# Count of respondents whose gender is "m".
# Index by level name rather than by position so the result does not depend
# on the order of the factor levels.
table(cdc$gender)["m"]
## m
## 9569
Proportion of respondents in excellent health
# Percentage of respondents whose genhlth is "excellent".
# The level is selected by name rather than by position so the result does
# not depend on the order of the factor levels.
health_share <- table(cdc$genhlth) / length(cdc$genhlth)
cat("Total % of Excellent Health", health_share[["excellent"]] * 100)
## Total % of Excellent Health 23.285
# Mosaic plot of the smoke100-by-gender contingency table.
# NOTE(review): the table is passed inline; the default plot title is
# presumably derived from this expression — confirm before refactoring it
# into a variable.
mosaicplot(table(cdc$smoke100, cdc$gender))
We can observe in this plot that a larger proportion of men than women have smoked.
# Rows of cdc where smoke100 equals 1 and age is below 23; column names are
# resolved inside the data frame by subset().
under23_and_smoke <- subset(cdc, smoke100 == 1 & age < 23)
# Preview the first ten matching rows.
head(under23_and_smoke, n = 10)
## genhlth exerany hlthplan smoke100 height weight wtdesire age gender
## 13 excellent 1 0 1 66 185 220 21 m
## 37 very good 1 0 1 70 160 140 18 f
## 96 excellent 1 1 1 74 175 200 22 m
## 180 good 1 1 1 64 190 140 20 f
## 182 very good 1 1 1 62 92 92 21 f
## 240 very good 1 0 1 64 125 115 22 f
## 262 fair 0 1 1 71 185 185 20 m
## 296 fair 1 1 1 72 185 170 19 m
## 297 excellent 1 0 1 63 105 100 19 m
## 300 fair 1 1 1 71 185 150 18 m
# Body mass index: weight divided by squared height, scaled by 703.
bmi <- with(cdc, (weight / height^2) * 703)
# Side-by-side box plots of BMI for each exerany group.
boxplot(bmi ~ cdc$exerany, xlab = "Exercise", ylab = "BMI")
I compared BMI against exerany (exercise) because I wanted to know whether exercising has an impact on BMI. Observing this graph, we see that the median BMI is lower for people who exercise.
# Scatter plot of actual weight (x-axis) against desired weight (y-axis).
plot(cdc$weight , cdc$wtdesire, col= "Blue")
There seems to be a strong correlation between these two variables.
# Desired weight minus current weight, one value per respondent.
wdiff <- with(cdc, wtdesire - weight)
# Check the type and leading values of the result.
str(wdiff)
## int [1:20000] 0 -10 0 -8 -20 0 -9 -10 -20 -10 ...
The data is numeric (integer).
If the value is positive, the person wants to gain weight.
If the value is negative, the person wants to lose weight.
If the value is 0, the person is at their desired weight.
# Min, quartiles, median, mean and max of the desired-minus-actual weight
# difference.
summary(wdiff)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -300.00 -21.00 -10.00 -14.59 0.00 500.00
# Histogram of the same differences.
hist(wdiff)
Observing the graph, it shows that most of the values are close to zero, which demonstrates that most people are at or close to their desired weight.
# Store the weight difference in cdc as a new column named wdiff.
cdc[["wdiff"]] <- wdiff
# Preview the first six rows to confirm the column was added.
head(cdc, n = 6L)
## genhlth exerany hlthplan smoke100 height weight wtdesire age gender
## 1 good 0 1 0 70 175 175 77 m
## 2 good 0 1 1 64 125 115 33 f
## 3 good 1 1 1 60 105 105 49 f
## 4 good 1 1 0 66 132 124 42 f
## 5 very good 0 1 0 61 150 130 55 f
## 6 very good 1 1 0 64 114 114 55 f
## wdiff
## 1 0
## 2 -10
## 3 0
## 4 -8
## 5 -20
## 6 0
# Weight-difference summary restricted to rows where gender is "m".
with(cdc, summary(wdiff[gender == "m"]))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -300.00 -20.00 -5.00 -10.71 0.00 500.00
# Weight-difference summary restricted to rows where gender is "f".
with(cdc, summary(wdiff[gender == "f"]))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -300.00 -27.00 -10.00 -18.15 0.00 83.00
# Box plots of the weight difference (wdiff), one box per gender level.
boxplot(cdc$wdiff ~ cdc$gender , col = "green", xlab= "Gender", ylab= "Difference")
Observing the graph, we can see that both men and women want to lose weight, but women seem to want to lose a little more.
# Percentage of respondents whose weight lies strictly within one standard
# deviation of the mean weight.
# The intermediate values get their own names so the base functions mean()
# and sd() are not masked by variables.
weight_mean <- mean(cdc$weight)
weight_sd <- sd(cdc$weight)
within_one_sd <- cdc$weight > (weight_mean - weight_sd) &
  cdc$weight < (weight_mean + weight_sd)
# The mean of a logical vector is the proportion of TRUE values.
cat(mean(within_one_sd) * 100, "%")
## 70.76 %