## Run the remote OpenIntro script; the statements below rely on it having
## created the `cdc` data frame.
## NOTE(review): the script is fetched over plain HTTP and executed as-is.
source(file = "http://www.openintro.org/stat/data/cdc.R")
## Column names of the data frame.
colnames(cdc)
## [1] "genhlth" "exerany" "hlthplan" "smoke100" "height" "weight"
## [7] "wtdesire" "age" "gender"
## exercise 1; 20 thousand observations; 9 variables; genhlth - categorical, exerany - discrete, hlthplan - discrete, smoke100 - discrete, height - discrete, weight - discrete, wtdesire - discrete, age - discrete, gender - categorical
## Per-column summaries for every variable in the data frame.
summary(object = cdc)
## genhlth exerany hlthplan smoke100
## excellent:4657 Min. :0.0000 Min. :0.0000 Min. :0.0000
## very good:6972 1st Qu.:0.0000 1st Qu.:1.0000 1st Qu.:0.0000
## good :5675 Median :1.0000 Median :1.0000 Median :0.0000
## fair :2019 Mean :0.7457 Mean :0.8738 Mean :0.4721
## poor : 677 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.:1.0000
## Max. :1.0000 Max. :1.0000 Max. :1.0000
## height weight wtdesire age gender
## Min. :48.00 Min. : 68.0 Min. : 68.0 Min. :18.00 m: 9569
## 1st Qu.:64.00 1st Qu.:140.0 1st Qu.:130.0 1st Qu.:31.00 f:10431
## Median :67.00 Median :165.0 Median :150.0 Median :43.00
## Mean :67.18 Mean :169.7 Mean :155.1 Mean :45.07
## 3rd Qu.:70.00 3rd Qu.:190.0 3rd Qu.:175.0 3rd Qu.:57.00
## Max. :93.00 Max. :500.0 Max. :680.0 Max. :99.00
## First six rows of the data frame.
head(x = cdc, n = 6L)
## genhlth exerany hlthplan smoke100 height weight wtdesire age gender
## 1 good 0 1 0 70 175 175 77 m
## 2 good 0 1 1 64 125 115 33 f
## 3 good 1 1 1 60 105 105 49 f
## 4 good 1 1 0 66 132 124 42 f
## 5 very good 0 1 0 61 150 130 55 f
## 6 very good 1 1 0 64 114 114 55 f
## Last six rows of the data frame.
tail(x = cdc, n = 6L)
## genhlth exerany hlthplan smoke100 height weight wtdesire age
## 19995 good 0 1 1 69 224 224 73
## 19996 good 1 1 0 66 215 140 23
## 19997 excellent 0 1 0 73 200 185 35
## 19998 poor 0 1 0 65 216 150 57
## 19999 good 1 1 0 67 165 165 81
## 20000 good 1 1 1 69 170 165 83
## gender
## 19995 m
## 19996 f
## 19997 m
## 19998 f
## 19999 f
## 20000 m
## Quartile summary (plus mean) of the reported weights.
summary(cdc$weight)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 68.0 140.0 165.0 169.7 190.0 500.0
## Interquartile range computed from the data instead of retyping the
## printed quartiles (190 - 140), so it stays correct if the data change.
IQR(cdc$weight)
## [1] 50
mean(cdc$weight)
## [1] 169.683
var(cdc$weight)
## [1] 1606.484
median(cdc$weight)
## [1] 165
## Counts of respondents for each smoke100 value.
table(cdc$smoke100)
##
## 0 1
## 10559 9441
## Relative frequencies; prop.table() divides by the table total rather
## than a hard-coded sample size of 20000.
prop.table(table(cdc$smoke100))
##
## 0 1
## 0.52795 0.47205
## Bar plot of the smoke100 counts.
barplot(table(cdc$smoke100))

## exercise 2 beginning
## Height: quartile summary and interquartile range.
summary(cdc$height)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 48.00 64.00 67.00 67.18 70.00 93.00
IQR(cdc$height)
## [1] 6
## Age: quartile summary and interquartile range.
summary(cdc$age)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 18.00 31.00 43.00 45.07 57.00 99.00
IQR(cdc$age)
## [1] 26
## Counts by gender.
table(cdc$gender)
##
## m f
## 9569 10431
## male 9569
## Relative frequencies via prop.table(), which divides by the table total
## rather than a hard-coded sample size of 20000.
prop.table(table(cdc$gender))
##
## m f
## 0.47845 0.52155
prop.table(table(cdc$exerany))
##
## 0 1
## 0.2543 0.7457
prop.table(table(cdc$genhlth))
##
## excellent very good good fair poor
## 0.23285 0.34860 0.28375 0.10095 0.03385
## excellent health 0.23285
## Two-way table of gender against smoke100, then the same table as a
## mosaic plot.
table(cdc$gender, cdc$smoke100)
##
## 0 1
## m 4547 5022
## f 6012 4419
mosaicplot(table(cdc$gender, cdc$smoke100))

## exercise 3; males have smoked more
## Size of the data frame: number of rows, then number of columns.
dim(cdc)
## [1] 20000 9
## Row 567 of the sixth column, addressed by its name ("weight").
cdc[567, "weight"]
## [1] 160
colnames(cdc)
## [1] "genhlth" "exerany" "hlthplan" "smoke100" "height" "weight"
## [7] "wtdesire" "age" "gender"
## Weights of the first ten respondents.
cdc[seq_len(10), "weight"]
## [1] 175 125 105 132 150 114 194 170 150 180
## The row index vector used above.
seq_len(10)
## [1] 1 2 3 4 5 6 7 8 9 10
## All columns for the first ten respondents.
cdc[seq_len(10), ]
## genhlth exerany hlthplan smoke100 height weight wtdesire age gender
## 1 good 0 1 0 70 175 175 77 m
## 2 good 0 1 1 64 125 115 33 f
## 3 good 1 1 1 60 105 105 49 f
## 4 good 1 1 0 66 132 124 42 f
## 5 very good 0 1 0 61 150 130 55 f
## 6 very good 1 1 0 64 114 114 55 f
## 7 very good 1 1 0 71 194 185 31 m
## 8 very good 0 1 0 67 170 160 45 m
## 9 good 0 1 1 65 150 130 27 f
## 10 good 1 1 0 70 180 170 44 m
## Same value as cdc[567, "weight"], extracted from the column vector.
cdc[["weight"]][567]
## [1] 160
## Rows whose gender is "m"; subset() resolves `gender` inside cdc.
mdata <- subset(cdc, gender == "m")
head(x = mdata, n = 6L)
## genhlth exerany hlthplan smoke100 height weight wtdesire age gender
## 1 good 0 1 0 70 175 175 77 m
## 7 very good 1 1 0 71 194 185 31 m
## 8 very good 0 1 0 67 170 160 45 m
## 10 good 1 1 0 70 180 170 44 m
## 11 excellent 1 1 1 69 186 175 46 m
## 12 fair 1 1 1 69 168 148 62 m
## Men who are also older than 30 (both conditions must hold).
m_and_over30 <- subset(cdc, gender == "m" & age > 30)
## Respondents who are men, older than 30, or both.
m_or_over30 <- subset(cdc, gender == "m" | age > 30)
## exercise 4;
## Respondents with smoke100 == 1 who are also younger than 23. The name
## asks for both conditions, so they are joined with `&`; the previous `|`
## kept everyone satisfying either condition.
under23_and_smoke <- subset(cdc, smoke100 == 1 & age < 23)
## Box plot of the reported heights.
boxplot(x = cdc$height)

summary(object = cdc$height)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 48.00 64.00 67.00 67.18 70.00 93.00
## Body mass index: weight over squared height, scaled by 703
## (presumably the pounds/inches conversion factor -- confirm).
bmi <- with(cdc, (weight / height^2) * 703)
## BMI distribution within each genhlth category.
boxplot(bmi ~ cdc$genhlth)

## exercise 5; people with excellent health have lower BMI than people with poor health (in particular, the poor-health group has a lot of outliers with very high BMI)
## BMI distribution split by the exerany indicator.
boxplot(bmi ~ cdc$exerany)

## people who exercise have slightly lower BMI on average
## Histogram of respondent ages using the default binning.
hist(cdc$age)

## Histogram of BMI using the default binning.
hist(bmi)

## Same BMI histogram, requesting 50 breaks for a finer-grained view.
hist(bmi, breaks = 50)

## On my own
## 1. The 2 variables have strong correlation
## Scatter plot with desired weight on the x axis and reported weight on
## the y axis.
plot(
  x = cdc$wtdesire,
  y = cdc$weight,
  main = "Weight desired vs weight",
  xlab = "Weight desired",
  ylab = "weight "
)

## 2.
## New column: desired weight minus reported weight. Negative values mean
## the desired weight is below the reported weight.
## Uses `<-`, the conventional R assignment operator, instead of `=`.
cdc$wdiff <- cdc$wtdesire - cdc$weight
head(cdc$wdiff)
## [1] 0 -10 0 -8 -20 0
## 3. It is discrete. 0 means that actual and desired weights match. Positive means that a person potentially wants to gain weight. Negative means that a person potentially wants to lose weight.
## 4. Distribution is left skewed. Median is bigger than mean, as expected for a left-skewed distribution. IQR is 21, so half of the people either do not want to lose any weight or want to lose up to 21 lbs, 25% want to gain weight, and another 25% want to lose more than 21 lbs. Many more people want to lose weight than to gain.
## Quartile summary (plus mean) of the desired-minus-actual weight difference.
summary(object = cdc$wdiff)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -300.00 -21.00 -10.00 -14.59 0.00 500.00
## Number of respondents at each distinct difference value.
table(cdc$wdiff)
##
## -300 -246 -235 -220 -210 -200 -190 -180 -175 -170 -165 -160 -155 -152 -150
## 2 1 1 1 1 6 1 1 1 4 2 3 1 4 17
## -148 -147 -145 -142 -140 -139 -135 -133 -132 -130 -128 -126 -125 -122 -120
## 1 1 4 2 8 1 4 1 3 15 1 2 9 2 27
## -117 -115 -113 -112 -110 -109 -108 -107 -105 -103 -100 -99 -98 -97 -96
## 1 11 3 3 16 3 3 1 12 1 115 1 3 2 1
## -95 -94 -93 -92 -91 -90 -88 -87 -86 -85 -84 -83 -82 -81 -80
## 11 2 2 2 1 46 6 2 5 22 3 4 5 2 72
## -79 -78 -77 -76 -75 -74 -73 -72 -71 -70 -69 -68 -67 -66 -65
## 4 6 5 3 43 6 5 8 2 125 6 9 5 9 62
## -64 -63 -62 -61 -60 -59 -58 -57 -56 -55 -54 -53 -52 -51 -50
## 8 8 10 4 184 8 15 8 12 101 13 11 14 4 395
## -49 -48 -47 -46 -45 -44 -43 -42 -41 -40 -39 -38 -37 -36 -35
## 12 23 18 13 189 18 12 21 8 556 18 31 34 23 360
## -34 -33 -32 -31 -30 -29 -28 -27 -26 -25 -24 -23 -22 -21 -20
## 32 30 28 22 893 36 70 65 26 677 47 70 74 41 1467
## -19 -18 -17 -16 -15 -14 -13 -12 -11 -10 -9 -8 -7 -6 -5
## 43 141 69 73 1173 71 157 171 64 1856 103 234 234 132 1253
## -4 -3 -2 -1 0 1 2 3 4 5 6 7 8 9 10
## 131 188 125 51 5616 19 39 33 21 232 34 58 43 17 333
## 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25
## 20 32 23 8 208 5 13 14 9 151 2 17 9 5 79
## 26 27 28 29 30 31 32 33 35 36 37 38 39 40 41
## 3 9 9 1 46 2 5 6 26 3 3 1 1 21 3
## 42 43 45 47 50 52 53 55 60 61 63 64 65 68 70
## 2 1 7 1 9 1 1 2 6 1 1 2 5 1 3
## 72 73 75 80 83 85 86 90 91 110 311 500
## 1 1 1 2 1 2 1 1 1 1 1 1
## Relative frequency of each difference value; prop.table() divides by the
## table total rather than a hard-coded sample size of 20000.
prop.table(table(cdc$wdiff))
##
## -300 -246 -235 -220 -210 -200 -190 -180 -175
## 0.00010 0.00005 0.00005 0.00005 0.00005 0.00030 0.00005 0.00005 0.00005
## -170 -165 -160 -155 -152 -150 -148 -147 -145
## 0.00020 0.00010 0.00015 0.00005 0.00020 0.00085 0.00005 0.00005 0.00020
## -142 -140 -139 -135 -133 -132 -130 -128 -126
## 0.00010 0.00040 0.00005 0.00020 0.00005 0.00015 0.00075 0.00005 0.00010
## -125 -122 -120 -117 -115 -113 -112 -110 -109
## 0.00045 0.00010 0.00135 0.00005 0.00055 0.00015 0.00015 0.00080 0.00015
## -108 -107 -105 -103 -100 -99 -98 -97 -96
## 0.00015 0.00005 0.00060 0.00005 0.00575 0.00005 0.00015 0.00010 0.00005
## -95 -94 -93 -92 -91 -90 -88 -87 -86
## 0.00055 0.00010 0.00010 0.00010 0.00005 0.00230 0.00030 0.00010 0.00025
## -85 -84 -83 -82 -81 -80 -79 -78 -77
## 0.00110 0.00015 0.00020 0.00025 0.00010 0.00360 0.00020 0.00030 0.00025
## -76 -75 -74 -73 -72 -71 -70 -69 -68
## 0.00015 0.00215 0.00030 0.00025 0.00040 0.00010 0.00625 0.00030 0.00045
## -67 -66 -65 -64 -63 -62 -61 -60 -59
## 0.00025 0.00045 0.00310 0.00040 0.00040 0.00050 0.00020 0.00920 0.00040
## -58 -57 -56 -55 -54 -53 -52 -51 -50
## 0.00075 0.00040 0.00060 0.00505 0.00065 0.00055 0.00070 0.00020 0.01975
## -49 -48 -47 -46 -45 -44 -43 -42 -41
## 0.00060 0.00115 0.00090 0.00065 0.00945 0.00090 0.00060 0.00105 0.00040
## -40 -39 -38 -37 -36 -35 -34 -33 -32
## 0.02780 0.00090 0.00155 0.00170 0.00115 0.01800 0.00160 0.00150 0.00140
## -31 -30 -29 -28 -27 -26 -25 -24 -23
## 0.00110 0.04465 0.00180 0.00350 0.00325 0.00130 0.03385 0.00235 0.00350
## -22 -21 -20 -19 -18 -17 -16 -15 -14
## 0.00370 0.00205 0.07335 0.00215 0.00705 0.00345 0.00365 0.05865 0.00355
## -13 -12 -11 -10 -9 -8 -7 -6 -5
## 0.00785 0.00855 0.00320 0.09280 0.00515 0.01170 0.01170 0.00660 0.06265
## -4 -3 -2 -1 0 1 2 3 4
## 0.00655 0.00940 0.00625 0.00255 0.28080 0.00095 0.00195 0.00165 0.00105
## 5 6 7 8 9 10 11 12 13
## 0.01160 0.00170 0.00290 0.00215 0.00085 0.01665 0.00100 0.00160 0.00115
## 14 15 16 17 18 19 20 21 22
## 0.00040 0.01040 0.00025 0.00065 0.00070 0.00045 0.00755 0.00010 0.00085
## 23 24 25 26 27 28 29 30 31
## 0.00045 0.00025 0.00395 0.00015 0.00045 0.00045 0.00005 0.00230 0.00010
## 32 33 35 36 37 38 39 40 41
## 0.00025 0.00030 0.00130 0.00015 0.00015 0.00005 0.00005 0.00105 0.00015
## 42 43 45 47 50 52 53 55 60
## 0.00010 0.00005 0.00035 0.00005 0.00045 0.00005 0.00005 0.00010 0.00030
## 61 63 64 65 68 70 72 73 75
## 0.00005 0.00005 0.00010 0.00025 0.00005 0.00015 0.00005 0.00005 0.00005
## 80 83 85 86 90 91 110 311 500
## 0.00010 0.00005 0.00010 0.00005 0.00005 0.00005 0.00005 0.00005 0.00005
## Standard deviation of the weight difference, using sd() instead of a
## hand-rolled square root of the variance.
sd(cdc$wdiff)
## [1] 24.04586
## Bar plot of the counts at each difference value.
barplot(table(cdc$wdiff))

## Interquartile range of the weight difference.
IQR(cdc$wdiff)
## [1] 21
boxplot(cdc$wdiff)

## 5. It seems that more men than women want to gain weight, while women tend to want to lose more weight. The men's distribution is right skewed, while the women's distribution is left skewed.
## Weight difference compared across the two gender groups.
boxplot(cdc$wdiff ~ cdc$gender)

## Per-gender data frames. which() drops rows where the comparison is NA,
## which matches what subset() does.
m <- cdc[which(cdc$gender == "m"), ]
f <- cdc[which(cdc$gender == "f"), ]
hist(x = m$wdiff)

hist(x = f$wdiff)

summary(object = m$wdiff)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -300.00 -20.00 -5.00 -10.71 0.00 500.00
summary(object = f$wdiff)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -300.00 -27.00 -10.00 -18.15 0.00 83.00
## 6. Proportion of respondents whose weight lies within one standard
## deviation of the mean weight.
## NOTE(review): the 84.7% (16935/20000) recorded here previously came from
## a filter that reduced to `weight <= mean + sd` (the lower bound was never
## applied); re-run this section to obtain the corrected figure.
wt_mean <- mean(cdc$weight)
wt_mean
## [1] 169.683
wt_sd <- sd(cdc$weight)
wt_sd
## [1] 40.08097
## Keep rows inside [mean - sd, mean + sd]: both bounds must hold, so the
## conditions are joined with `&` and the lower bound uses `>=`.
mysubset <- subset(cdc, weight >= wt_mean - wt_sd & weight <= wt_mean + wt_sd)
head(mysubset)
## (expected rows, derived from the first rows of cdc printed earlier in
## this file; re-run to confirm)
## genhlth exerany hlthplan smoke100 height weight wtdesire age gender
## 1 good 0 1 0 70 175 175 77 m
## 4 good 1 1 0 66 132 124 42 f
## 5 very good 0 1 0 61 150 130 55 f
## 7 very good 1 1 0 71 194 185 31 m
## 8 very good 0 1 0 67 170 160 45 m
## 9 good 0 1 1 65 150 130 27 f
## wdiff
## 1 0
## 4 -8
## 5 -20
## 7 -9
## 8 -10
## 9 -20
## Proportion computed from the data rather than from a hand-typed count.
nrow(mysubset) / nrow(cdc)
## Histogram of the reported weights using the default binning.
hist(cdc$weight)

## Box plot of the same weights.
boxplot(cdc$weight)
