source("http://www.openintro.org/stat/data/cdc.R")

## List the column names of the cdc data frame.
names(cdc)
## [1] "genhlth"  "exerany"  "hlthplan" "smoke100" "height"   "weight"  
## [7] "wtdesire" "age"      "gender"
## exercise 1; 20,000 observations; 9 variables; genhlth - categorical, exerany - discrete, hlthplan - discrete, smoke100 - discrete, height - discrete, weight - discrete, wtdesire - discrete, age - discrete, gender - categorical

## Per-column summary of cdc: level counts for genhlth and gender,
## min/quartiles/mean/max for the remaining columns (see the output below).
summary(cdc)
##       genhlth        exerany          hlthplan         smoke100     
##  excellent:4657   Min.   :0.0000   Min.   :0.0000   Min.   :0.0000  
##  very good:6972   1st Qu.:0.0000   1st Qu.:1.0000   1st Qu.:0.0000  
##  good     :5675   Median :1.0000   Median :1.0000   Median :0.0000  
##  fair     :2019   Mean   :0.7457   Mean   :0.8738   Mean   :0.4721  
##  poor     : 677   3rd Qu.:1.0000   3rd Qu.:1.0000   3rd Qu.:1.0000  
##                   Max.   :1.0000   Max.   :1.0000   Max.   :1.0000  
##      height          weight         wtdesire          age        gender   
##  Min.   :48.00   Min.   : 68.0   Min.   : 68.0   Min.   :18.00   m: 9569  
##  1st Qu.:64.00   1st Qu.:140.0   1st Qu.:130.0   1st Qu.:31.00   f:10431  
##  Median :67.00   Median :165.0   Median :150.0   Median :43.00            
##  Mean   :67.18   Mean   :169.7   Mean   :155.1   Mean   :45.07            
##  3rd Qu.:70.00   3rd Qu.:190.0   3rd Qu.:175.0   3rd Qu.:57.00            
##  Max.   :93.00   Max.   :500.0   Max.   :680.0   Max.   :99.00
## First six rows of cdc (six is head()'s default, spelled out here).
head(cdc, n = 6L)
##     genhlth exerany hlthplan smoke100 height weight wtdesire age gender
## 1      good       0        1        0     70    175      175  77      m
## 2      good       0        1        1     64    125      115  33      f
## 3      good       1        1        1     60    105      105  49      f
## 4      good       1        1        0     66    132      124  42      f
## 5 very good       0        1        0     61    150      130  55      f
## 6 very good       1        1        0     64    114      114  55      f
## Last six rows of cdc (six is tail()'s default, spelled out here).
tail(cdc, n = 6L)
##         genhlth exerany hlthplan smoke100 height weight wtdesire age
## 19995      good       0        1        1     69    224      224  73
## 19996      good       1        1        0     66    215      140  23
## 19997 excellent       0        1        0     73    200      185  35
## 19998      poor       0        1        0     65    216      150  57
## 19999      good       1        1        0     67    165      165  81
## 20000      good       1        1        1     69    170      165  83
##       gender
## 19995      m
## 19996      f
## 19997      m
## 19998      f
## 19999      f
## 20000      m
## Numerical summaries of the weight column.
summary(cdc$weight)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    68.0   140.0   165.0   169.7   190.0   500.0
## Interquartile range, computed from the data rather than by retyping the
## two quartiles printed above.
IQR(cdc$weight)
## [1] 50
mean(cdc$weight)
## [1] 169.683
var(cdc$weight)
## [1] 1606.484
median(cdc$weight)
## [1] 165
## Counts of respondents for each value of smoke100.
table(cdc$smoke100)
## 
##     0     1 
## 10559  9441
## Relative frequencies. prop.table() divides by the table total, so the
## sample size does not have to be typed in by hand.
prop.table(table(cdc$smoke100))
## 
##       0       1 
## 0.52795 0.47205
## Bar plot of the counts.
barplot(table(cdc$smoke100))

## exercise 2 beginning

## Numerical summary and interquartile range of height.
summary(cdc[["height"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   48.00   64.00   67.00   67.18   70.00   93.00
IQR(cdc[["height"]])
## [1] 6
## The same two figures for age.
summary(cdc[["age"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   18.00   31.00   43.00   45.07   57.00   99.00
IQR(cdc[["age"]])
## [1] 26
## Counts of respondents by gender.
table(cdc[["gender"]])
## 
##     m     f 
##  9569 10431
## male 9569

## Relative frequencies for gender, exerany and genhlth. prop.table()
## divides each count by the table total instead of a typed-in sample size.
prop.table(table(cdc$gender))
## 
##       m       f 
## 0.47845 0.52155
prop.table(table(cdc$exerany))
## 
##      0      1 
## 0.2543 0.7457
prop.table(table(cdc$genhlth))
## 
## excellent very good      good      fair      poor 
##   0.23285   0.34860   0.28375   0.10095   0.03385
## excellent health 0.23285

## Cross-tabulate gender (rows) against smoke100 (columns).
table(cdc$gender,cdc$smoke100)
##    
##        0    1
##   m 4547 5022
##   f 6012 4419
## Draw the same two-way table as a mosaic plot.
mosaicplot(table(cdc$gender,cdc$smoke100))

## exercise 3; males have smoked more

## Size of the data frame as (rows, columns).
c(nrow(cdc), ncol(cdc))
## [1] 20000     9
## Row 567 of the weight column (column 6), addressed by name.
cdc[567, "weight"]
## [1] 160
colnames(cdc)
## [1] "genhlth"  "exerany"  "hlthplan" "smoke100" "height"   "weight"  
## [7] "wtdesire" "age"      "gender"
## Weights of the first ten respondents.
cdc[1:10, "weight"]
##  [1] 175 125 105 132 150 114 194 170 150 180
## The row index vector used above.
seq_len(10)
##  [1]  1  2  3  4  5  6  7  8  9 10
## The first ten rows with every column.
head(cdc, n = 10L)
##      genhlth exerany hlthplan smoke100 height weight wtdesire age gender
## 1       good       0        1        0     70    175      175  77      m
## 2       good       0        1        1     64    125      115  33      f
## 3       good       1        1        1     60    105      105  49      f
## 4       good       1        1        0     66    132      124  42      f
## 5  very good       0        1        0     61    150      130  55      f
## 6  very good       1        1        0     64    114      114  55      f
## 7  very good       1        1        0     71    194      185  31      m
## 8  very good       0        1        0     67    170      160  45      m
## 9       good       0        1        1     65    150      130  27      f
## 10      good       1        1        0     70    180      170  44      m
## The same value as cdc[567, "weight"], reached through the weight vector.
cdc[["weight"]][567]
## [1] 160
## Male respondents only. subset() evaluates the condition inside cdc, so
## the column can be named directly.
mdata <- subset(cdc, gender == "m")

head(mdata, n = 6L)
##      genhlth exerany hlthplan smoke100 height weight wtdesire age gender
## 1       good       0        1        0     70    175      175  77      m
## 7  very good       1        1        0     71    194      185  31      m
## 8  very good       0        1        0     67    170      160  45      m
## 10      good       1        1        0     70    180      170  44      m
## 11 excellent       1        1        1     69    186      175  46      m
## 12      fair       1        1        1     69    168      148  62      m
## Respondents who are male and also older than 30 (both conditions hold).
m_and_over30 <- subset(cdc, age > 30 & gender == "m")

## Respondents who are male, older than 30, or both.
m_or_over30 <- subset(cdc, age > 30 | gender == "m")


## exercise 4; 

## Respondents with smoke100 == 1 who are also younger than 23. Both
## conditions must hold, so they are joined with `&`; `|` would keep anyone
## meeting either condition.
under23_and_smoke <- subset(cdc, smoke100 == 1 & age < 23)

## Box plot of height, followed by the matching numerical summary.
boxplot(cdc$height)

summary(cdc$height)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   48.00   64.00   67.00   67.18   70.00   93.00
## Body mass index from weight and height.
## NOTE(review): 703 looks like the usual pounds/inches BMI conversion
## factor -- confirm the units of the weight and height columns.
bmi <- (cdc$weight / cdc$height^2) * 703

## Side-by-side box plots of BMI, one per genhlth category.
boxplot(bmi ~ cdc$genhlth)

## exercise 5; people with excellent health have lower BMI than people with poor health (people with poor health in particular have a lot of outliers with very high BMI)

## Side-by-side box plots of BMI for exerany == 0 and exerany == 1.
boxplot(bmi ~ cdc$exerany)

## people who exercise have slightly lower BMI on average

## Histogram of age with the default binning.
hist(cdc$age)

## Histogram of BMI with the default binning ...
hist(bmi)

## ... and again asking for roughly 50 bins to show finer detail.
hist(bmi, breaks = 50)

## On my own


## 1. The 2 variables have strong correlation

## Scatter plot of current weight (y axis) against desired weight (x axis).
plot(
  x = cdc$wtdesire,
  y = cdc$weight,
  xlab = "Weight desired",
  ylab = "weight ",
  main = "Weight desired vs weight"
)

## 2. 

## wdiff: desired weight minus current weight, stored as a new column of cdc.
cdc[["wdiff"]] <- cdc[["wtdesire"]] - cdc[["weight"]]

head(cdc[["wdiff"]], n = 6L)
## [1]   0 -10   0  -8 -20   0
## 3. It is discrete. 0 means that the actual and desired weights match. A positive value means that the person wants to gain weight; a negative value means that the person wants to lose weight.


## 4. The distribution is left skewed. The median (-10) is bigger than the mean (-14.59), as expected for a left-skewed distribution. The IQR is 21 (Q1 = -21, Q3 = 0), so the middle half of the people want to lose between 0 and 21 lbs, at most a quarter want to gain weight, and about a quarter want to lose 21 lbs or more. Many more people want to lose weight than to gain it.

## Quartiles and mean of the desired-minus-current weight difference.
summary(cdc[["wdiff"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## -300.00  -21.00  -10.00  -14.59    0.00  500.00
## Frequency of every distinct wdiff value.
table(cdc[["wdiff"]])
## 
## -300 -246 -235 -220 -210 -200 -190 -180 -175 -170 -165 -160 -155 -152 -150 
##    2    1    1    1    1    6    1    1    1    4    2    3    1    4   17 
## -148 -147 -145 -142 -140 -139 -135 -133 -132 -130 -128 -126 -125 -122 -120 
##    1    1    4    2    8    1    4    1    3   15    1    2    9    2   27 
## -117 -115 -113 -112 -110 -109 -108 -107 -105 -103 -100  -99  -98  -97  -96 
##    1   11    3    3   16    3    3    1   12    1  115    1    3    2    1 
##  -95  -94  -93  -92  -91  -90  -88  -87  -86  -85  -84  -83  -82  -81  -80 
##   11    2    2    2    1   46    6    2    5   22    3    4    5    2   72 
##  -79  -78  -77  -76  -75  -74  -73  -72  -71  -70  -69  -68  -67  -66  -65 
##    4    6    5    3   43    6    5    8    2  125    6    9    5    9   62 
##  -64  -63  -62  -61  -60  -59  -58  -57  -56  -55  -54  -53  -52  -51  -50 
##    8    8   10    4  184    8   15    8   12  101   13   11   14    4  395 
##  -49  -48  -47  -46  -45  -44  -43  -42  -41  -40  -39  -38  -37  -36  -35 
##   12   23   18   13  189   18   12   21    8  556   18   31   34   23  360 
##  -34  -33  -32  -31  -30  -29  -28  -27  -26  -25  -24  -23  -22  -21  -20 
##   32   30   28   22  893   36   70   65   26  677   47   70   74   41 1467 
##  -19  -18  -17  -16  -15  -14  -13  -12  -11  -10   -9   -8   -7   -6   -5 
##   43  141   69   73 1173   71  157  171   64 1856  103  234  234  132 1253 
##   -4   -3   -2   -1    0    1    2    3    4    5    6    7    8    9   10 
##  131  188  125   51 5616   19   39   33   21  232   34   58   43   17  333 
##   11   12   13   14   15   16   17   18   19   20   21   22   23   24   25 
##   20   32   23    8  208    5   13   14    9  151    2   17    9    5   79 
##   26   27   28   29   30   31   32   33   35   36   37   38   39   40   41 
##    3    9    9    1   46    2    5    6   26    3    3    1    1   21    3 
##   42   43   45   47   50   52   53   55   60   61   63   64   65   68   70 
##    2    1    7    1    9    1    1    2    6    1    1    2    5    1    3 
##   72   73   75   80   83   85   86   90   91  110  311  500 
##    1    1    1    2    1    2    1    1    1    1    1    1
## Relative frequency of every distinct wdiff value. prop.table() divides
## by the table total rather than a typed-in sample size.
prop.table(table(cdc$wdiff))
## 
##    -300    -246    -235    -220    -210    -200    -190    -180    -175 
## 0.00010 0.00005 0.00005 0.00005 0.00005 0.00030 0.00005 0.00005 0.00005 
##    -170    -165    -160    -155    -152    -150    -148    -147    -145 
## 0.00020 0.00010 0.00015 0.00005 0.00020 0.00085 0.00005 0.00005 0.00020 
##    -142    -140    -139    -135    -133    -132    -130    -128    -126 
## 0.00010 0.00040 0.00005 0.00020 0.00005 0.00015 0.00075 0.00005 0.00010 
##    -125    -122    -120    -117    -115    -113    -112    -110    -109 
## 0.00045 0.00010 0.00135 0.00005 0.00055 0.00015 0.00015 0.00080 0.00015 
##    -108    -107    -105    -103    -100     -99     -98     -97     -96 
## 0.00015 0.00005 0.00060 0.00005 0.00575 0.00005 0.00015 0.00010 0.00005 
##     -95     -94     -93     -92     -91     -90     -88     -87     -86 
## 0.00055 0.00010 0.00010 0.00010 0.00005 0.00230 0.00030 0.00010 0.00025 
##     -85     -84     -83     -82     -81     -80     -79     -78     -77 
## 0.00110 0.00015 0.00020 0.00025 0.00010 0.00360 0.00020 0.00030 0.00025 
##     -76     -75     -74     -73     -72     -71     -70     -69     -68 
## 0.00015 0.00215 0.00030 0.00025 0.00040 0.00010 0.00625 0.00030 0.00045 
##     -67     -66     -65     -64     -63     -62     -61     -60     -59 
## 0.00025 0.00045 0.00310 0.00040 0.00040 0.00050 0.00020 0.00920 0.00040 
##     -58     -57     -56     -55     -54     -53     -52     -51     -50 
## 0.00075 0.00040 0.00060 0.00505 0.00065 0.00055 0.00070 0.00020 0.01975 
##     -49     -48     -47     -46     -45     -44     -43     -42     -41 
## 0.00060 0.00115 0.00090 0.00065 0.00945 0.00090 0.00060 0.00105 0.00040 
##     -40     -39     -38     -37     -36     -35     -34     -33     -32 
## 0.02780 0.00090 0.00155 0.00170 0.00115 0.01800 0.00160 0.00150 0.00140 
##     -31     -30     -29     -28     -27     -26     -25     -24     -23 
## 0.00110 0.04465 0.00180 0.00350 0.00325 0.00130 0.03385 0.00235 0.00350 
##     -22     -21     -20     -19     -18     -17     -16     -15     -14 
## 0.00370 0.00205 0.07335 0.00215 0.00705 0.00345 0.00365 0.05865 0.00355 
##     -13     -12     -11     -10      -9      -8      -7      -6      -5 
## 0.00785 0.00855 0.00320 0.09280 0.00515 0.01170 0.01170 0.00660 0.06265 
##      -4      -3      -2      -1       0       1       2       3       4 
## 0.00655 0.00940 0.00625 0.00255 0.28080 0.00095 0.00195 0.00165 0.00105 
##       5       6       7       8       9      10      11      12      13 
## 0.01160 0.00170 0.00290 0.00215 0.00085 0.01665 0.00100 0.00160 0.00115 
##      14      15      16      17      18      19      20      21      22 
## 0.00040 0.01040 0.00025 0.00065 0.00070 0.00045 0.00755 0.00010 0.00085 
##      23      24      25      26      27      28      29      30      31 
## 0.00045 0.00025 0.00395 0.00015 0.00045 0.00045 0.00005 0.00230 0.00010 
##      32      33      35      36      37      38      39      40      41 
## 0.00025 0.00030 0.00130 0.00015 0.00015 0.00005 0.00005 0.00105 0.00015 
##      42      43      45      47      50      52      53      55      60 
## 0.00010 0.00005 0.00035 0.00005 0.00045 0.00005 0.00005 0.00010 0.00030 
##      61      63      64      65      68      70      72      73      75 
## 0.00005 0.00005 0.00010 0.00025 0.00005 0.00015 0.00005 0.00005 0.00005 
##      80      83      85      86      90      91     110     311     500 
## 0.00010 0.00005 0.00010 0.00005 0.00005 0.00005 0.00005 0.00005 0.00005
## Standard deviation of wdiff. sd() replaces the hand-written square root
## of var().
sd(cdc$wdiff)
## [1] 24.04586
## Bar plot of the wdiff frequencies.
barplot(table(cdc$wdiff))

## Interquartile range and box plot of wdiff.
IQR(cdc$wdiff)
## [1] 21
boxplot(cdc$wdiff)

## 5. It seems that more men than women want to gain weight, and women want to lose more weight than men do. The men's distribution is right skewed, while the women's distribution is left skewed.

## Side-by-side box plots of wdiff, one per gender.
boxplot(cdc$wdiff ~ cdc$gender)

## Split the respondents by gender with logical row indexing.
m <- cdc[cdc$gender == "m", ]

f <- cdc[cdc$gender == "f", ]

## Histogram of wdiff for each gender.
hist(m$wdiff)

hist(f$wdiff)

## Numerical summary of wdiff for each gender.
summary(m[["wdiff"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## -300.00  -20.00   -5.00  -10.71    0.00  500.00
summary(f[["wdiff"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## -300.00  -27.00  -10.00  -18.15    0.00   83.00
## 6. Share of respondents whose weight lies within one standard deviation
##    of the mean weight.
## NOTE(review): the previously recorded answer (84.7%) came from a filter
## that joined two `<=` tests with `|`, which keeps every weight at or below
## mean + 1 SD. Re-run this section to obtain the corrected share and the
## refreshed head() output.


weight_mean <- mean(cdc$weight)
weight_mean
## [1] 169.683
weight_sd <- sd(cdc$weight)
weight_sd
## [1] 40.08097
## Keep rows between mean - SD and mean + SD. Both bounds must hold at
## once, hence `&`, and the bounds are computed instead of retyped.
mysubset <- subset(
  cdc,
  weight >= weight_mean - weight_sd & weight <= weight_mean + weight_sd
)

head(mysubset)
## Proportion of all respondents that fall inside the band.
nrow(mysubset) / nrow(cdc)
## Histogram of weight, followed by a box plot of the same column.
hist(cdc$weight)

boxplot(cdc$weight)