We will be analyzing the data set named us_contagious_diseases in the R package dslabs. Using the us_contagious_diseases data set, answer the following questions. Please submit both the .Rmd and knitted .html files.
#install.packages("dslabs")
library(dslabs)
# Bind the dslabs contagious-disease table to the short name `d`, then print
# a compact overview of its columns and their types.
d <- dslabs::us_contagious_diseases
str(d)
## 'data.frame': 16065 obs. of 6 variables:
## $ disease : Factor w/ 7 levels "Hepatitis A",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ state : Factor w/ 51 levels "Alabama","Alaska",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ year : num 1966 1967 1968 1969 1970 ...
## $ weeks_reporting: num 50 49 52 49 51 51 45 45 45 46 ...
## $ count : num 321 291 314 380 413 378 342 467 244 286 ...
## $ population : num 3345787 3364130 3386068 3412450 3444165 ...
# Number of rows recorded for each disease level.
table(d[["disease"]])
##
## Hepatitis A Measles Mumps Pertussis Polio Rubella
## 2346 3825 1785 2856 2091 1887
## Smallpox
## 1275
(b) Compute the frequency of each type of disease, and the 0.15 quantile of the population.
# 15th percentile of the population column, ignoring missing values.
quantile(d[["population"]], probs = 0.15, na.rm = TRUE)
## 15%
## 670515
Find the top 5 states with the most Mumps cases over the 10 years from 1991 to 2000 (both years inclusive).
# Top 5 states by total Mumps cases over 1991-2000 (both years inclusive).
# Counts are summed per state before ranking; ranking the raw rows would rank
# single state-year records and let one state occupy several of the five slots.
# NOTE: the printed output following this chunk predates this fix; re-run to
# refresh it.
d %>%
  filter(disease == "Mumps", year >= 1991, year <= 2000) %>%
  group_by(state) %>%
  summarize(total_count = sum(count)) %>%
  arrange(desc(total_count)) %>%
  head(5)
## disease state year weeks_reporting count population
## 1 Mumps California 1991 49 389 30311890
## 2 Mumps South Carolina 1991 44 384 3527239
## 3 Mumps Texas 1994 46 378 18376501
## 4 Mumps Florida 1991 48 359 13246692
## 5 Mumps Texas 1991 48 340 17305041
For the state of Texas,
ave_count, representing the average count per weeks_reporting,
# Texas rows only: weekly average of reported cases. Rows with zero reporting
# weeks and zero cases evaluate to NaN (0 / 0).
d %>%
  filter(state == "Texas") %>%
  mutate(ave_count = count / weeks_reporting) %>%
  select(count, weeks_reporting, ave_count)
## count weeks_reporting ave_count
## 1 1808 52 3.476923e+01
## 2 2727 51 5.347059e+01
## 3 2190 50 4.380000e+01
## 4 2312 50 4.624000e+01
## 5 2741 51 5.374510e+01
## 6 3731 51 7.315686e+01
## 7 3407 46 7.406522e+01
## 8 4569 48 9.518750e+01
## 9 3200 43 7.441860e+01
## 10 2845 49 5.806122e+01
## 11 1531 41 3.734146e+01
## 12 2053 51 4.025490e+01
## 13 2578 50 5.156000e+01
## 14 3278 51 6.427451e+01
## 15 2066 38 5.436842e+01
## 16 2470 47 5.255319e+01
## 17 3088 50 6.176000e+01
## 18 2839 50 5.678000e+01
## 19 2014 45 4.475556e+01
## 20 1965 43 4.569767e+01
## 21 1443 39 3.700000e+01
## 22 1690 50 3.380000e+01
## 23 1020 22 4.636364e+01
## 24 1965 32 6.140625e+01
## 25 1886 40 4.715000e+01
## 26 1825 35 5.214286e+01
## 27 1089 26 4.188462e+01
## 28 1319 25 5.276000e+01
## 29 1651 27 6.114815e+01
## 30 2062 39 5.287179e+01
## 31 2555 43 5.941860e+01
## 32 1994 24 8.308333e+01
## 33 2540 34 7.470588e+01
## 34 1776 36 4.933333e+01
## 35 1016 39 2.605128e+01
## 36 971 36 2.697222e+01
## 37 424 46 9.217391e+00
## 38 397 39 1.017949e+01
## 39 558 41 1.360976e+01
## 40 161 48 3.354167e+00
## 41 147 46 3.195652e+00
## 42 71 43 1.651163e+00
## 43 61 48 1.270833e+00
## 44 59 49 1.204082e+00
## 45 93 50 1.860000e+00
## 46 91 49 1.857143e+00
## 47 5524 49 1.127347e+02
## 48 4107 52 7.898077e+01
## 49 4275 51 8.382353e+01
## 50 2336 44 5.309091e+01
## 51 4564 48 9.508333e+01
## 52 21734 52 4.179615e+02
## 53 29699 52 5.711346e+02
## 54 3923 50 7.846000e+01
## 55 7776 49 1.586939e+02
## 56 14694 51 2.881176e+02
## 57 6011 52 1.155962e+02
## 58 7380 51 1.447059e+02
## 59 16060 48 3.345833e+02
## 60 20305 47 4.320213e+02
## 61 34717 50 6.943400e+02
## 62 16370 47 3.482979e+02
## 63 37283 47 7.932553e+02
## 64 12892 48 2.685833e+02
## 65 29521 52 5.677115e+02
## 66 8090 50 1.618000e+02
## 67 44525 44 1.011932e+03
## 68 57133 52 1.098712e+03
## 69 13658 50 2.731600e+02
## 70 65601 49 1.338796e+03
## 71 23560 48 4.908333e+02
## 72 73840 48 1.538333e+03
## 73 75953 50 1.519060e+03
## 74 39466 49 8.054286e+02
## 75 66685 52 1.282404e+03
## 76 42797 48 8.916042e+02
## 77 82299 50 1.645980e+03
## 78 33917 48 7.066042e+02
## 79 49678 52 9.553462e+02
## 80 16111 51 3.159020e+02
## 81 66915 52 1.286827e+03
## 82 16697 48 3.478542e+02
## 83 39353 37 1.063595e+03
## 84 29509 52 5.674808e+02
## 85 25178 51 4.936863e+02
## 86 13283 52 2.554423e+02
## 87 5146 52 9.896154e+01
## 88 4957 51 9.719608e+01
## 89 8327 51 1.632745e+02
## 90 9101 51 1.784510e+02
## 91 1582 50 3.164000e+01
## 92 500 51 9.803922e+00
## 93 179 48 3.729167e+00
## 94 281 50 5.620000e+00
## 95 262 52 5.038462e+00
## 96 1889 49 3.855102e+01
## 97 1047 51 2.052941e+01
## 98 648 52 1.246154e+01
## 99 197 51 3.862745e+00
## 100 924 48 1.925000e+01
## 101 175 46 3.804348e+00
## 102 37 43 8.604651e-01
## 103 543 45 1.206667e+01
## 104 423 44 9.613636e+00
## 105 372 43 8.651163e+00
## 106 417 47 8.872340e+00
## 107 14 39 3.589744e-01
## 108 3129 46 6.802174e+01
## 109 3898 43 9.065116e+01
## 110 214 32 6.687500e+00
## 111 1002 32 3.131250e+01
## 112 10 22 4.545455e-01
## 113 16 41 3.902439e-01
## 114 13 22 5.909091e-01
## 115 27 38 7.105263e-01
## 116 9 39 2.307692e-01
## 117 0 0 NaN
## 118 7 46 1.521739e-01
## 119 0 0 NaN
## 120 0 39 0.000000e+00
## 121 1 20 5.000000e-02
## 122 11236 49 2.293061e+02
## 123 8070 51 1.582353e+02
## 124 5972 51 1.170980e+02
## 125 8815 51 1.728431e+02
## 126 4924 50 9.848000e+01
## 127 3672 51 7.200000e+01
## 128 3150 50 6.300000e+01
## 129 3890 52 7.480769e+01
## 130 1735 52 3.336538e+01
## 131 979 51 1.919608e+01
## 132 1527 52 2.936538e+01
## 133 906 52 1.742308e+01
## 134 208 51 4.078431e+00
## 135 216 48 4.500000e+00
## 136 257 52 4.942308e+00
## 137 217 50 4.340000e+00
## 138 181 50 3.620000e+00
## 139 315 51 6.176471e+00
## 140 216 49 4.408163e+00
## 141 317 51 6.215686e+00
## 142 294 51 5.764706e+00
## 143 596 52 1.146154e+01
## 144 456 49 9.306122e+00
## 145 340 48 7.083333e+00
## 146 299 41 7.292683e+00
## 147 226 49 4.612245e+00
## 148 378 46 8.217391e+00
## 149 27 46 5.869565e-01
## 150 24 37 6.486486e-01
## 151 47 47 1.000000e+00
## 152 31 43 7.209302e-01
## 153 23 47 4.893617e-01
## 154 21 34 6.176471e-01
## 155 12 37 3.243243e-01
## 156 14 41 3.414634e-01
## 157 9909 51 1.942941e+02
## 158 5535 52 1.064423e+02
## 159 8214 51 1.610588e+02
## 160 10667 51 2.091569e+02
## 161 7750 51 1.519608e+02
## 162 16131 52 3.102115e+02
## 163 10178 52 1.957308e+02
## 164 10263 52 1.973654e+02
## 165 8439 52 1.622885e+02
## 166 20636 51 4.046275e+02
## 167 12121 50 2.424200e+02
## 168 5279 52 1.015192e+02
## 169 11248 50 2.249600e+02
## 170 12796 48 2.665833e+02
## 171 11044 48 2.300833e+02
## 172 4489 48 9.352083e+01
## 173 5785 51 1.134314e+02
## 174 10133 51 1.986863e+02
## 175 143 32 4.468750e+00
## 176 141 36 3.916667e+00
## 177 36 24 1.500000e+00
## 178 83 35 2.371429e+00
## 179 132 41 3.219512e+00
## 180 110 40 2.750000e+00
## 181 77 38 2.026316e+00
## 182 84 34 2.470588e+00
## 183 71 36 1.972222e+00
## 184 98 48 2.041667e+00
## 185 61 44 1.386364e+00
## 186 351 30 1.170000e+01
## 187 89 28 3.178571e+00
## 188 78 16 4.875000e+00
## 189 119 33 3.606061e+00
## 190 250 24 1.041667e+01
## 191 82 18 4.555556e+00
## 192 81 23 3.521739e+00
## 193 34 7 4.857143e+00
## 194 83 34 2.441176e+00
## 195 121 44 2.750000e+00
## 196 126 47 2.680851e+00
## 197 77 39 1.974359e+00
## 198 109 47 2.319149e+00
## 199 157 46 3.413043e+00
## 200 100 48 2.083333e+00
## 201 152 34 4.470588e+00
## 202 222 40 5.550000e+00
## 203 974 45 2.164444e+01
## 204 754 37 2.037838e+01
## 205 831 41 2.026829e+01
## 206 1478 48 3.079167e+01
## 207 387 47 8.234043e+00
## 208 337 46 7.326087e+00
## 209 337 50 6.740000e+00
## 210 867 51 1.700000e+01
## 211 1255 51 2.460784e+01
## 212 522 50 1.044000e+01
## 213 38 52 7.307692e-01
## 214 18 52 3.461538e-01
## 215 128 51 2.509804e+00
## 216 34 51 6.666667e-01
## 217 72 52 1.384615e+00
## 218 47 52 9.038462e-01
## 219 151 52 2.903846e+00
## 220 82 52 1.576923e+00
## 221 68 52 1.307692e+00
## 222 665 52 1.278846e+01
## 223 238 50 4.760000e+00
## 224 213 51 4.176471e+00
## 225 175 47 3.723404e+00
## 226 114 49 2.326531e+00
## 227 235 48 4.895833e+00
## 228 952 46 2.069565e+01
## 229 374 47 7.957447e+00
## 230 821 46 1.784783e+01
## 231 961 50 1.922000e+01
## 232 230 51 4.509804e+00
## 233 1357 49 2.769388e+01
## 234 2313 52 4.448077e+01
## 235 2747 50 5.494000e+01
## 236 1924 49 3.926531e+01
## 237 3498 49 7.138776e+01
## 238 1698 48 3.537500e+01
## 239 3087 51 6.052941e+01
## 240 1819 49 3.712245e+01
## 241 1355 52 2.605769e+01
## 242 708 51 1.388235e+01
## 243 614 50 1.228000e+01
## 244 378 40 9.450000e+00
## 245 189 49 3.857143e+00
## 246 75 47 1.595745e+00
## 247 207 49 4.224490e+00
## 248 8 47 1.702128e-01
## 249 10 37 2.702703e-01
## 250 16 45 3.555556e-01
## 251 68 38 1.789474e+00
## 252 8 42 1.904762e-01
## 253 22 44 5.000000e-01
## 254 138 29 4.758621e+00
## 255 622 31 2.006452e+01
## 256 2867 49 5.851020e+01
## 257 4179 51 8.194118e+01
## 258 8323 51 1.631961e+02
## 259 4363 52 8.390385e+01
## 260 1502 49 3.065306e+01
## 261 1122 50 2.244000e+01
## 262 259 49 5.285714e+00
## 263 353 51 6.921569e+00
## 264 271 52 5.211538e+00
## 265 818 51 1.603922e+01
## 266 388 51 7.607843e+00
## 267 206 52 3.961538e+00
## 268 133 50 2.660000e+00
## 269 168 47 3.574468e+00
## 270 136 50 2.720000e+00
## 271 123 50 2.460000e+00
## 272 74 50 1.480000e+00
## 273 40 44 9.090909e-01
## 274 68 43 1.581395e+00
## 275 4 33 1.212121e-01
## 276 19 32 5.937500e-01
## 277 44 46 9.565217e-01
## 278 87 18 4.833333e+00
## 279 6 24 2.500000e-01
## 280 0 0 NaN
## 281 9 37 2.432432e-01
## 282 9 34 2.647059e-01
## 283 13 48 2.708333e-01
## 284 2 33 6.060606e-02
## 285 5 36 1.388889e-01
## 286 62 44 1.409091e+00
## 287 8 43 1.860465e-01
## 288 5 34 1.470588e-01
## 289 1 11 9.090909e-02
## 290 1 30 3.333333e-02
## 291 1793 51 3.515686e+01
## 292 2442 52 4.696154e+01
## 293 2736 52 5.261538e+01
## 294 1799 51 3.527451e+01
## 295 1068 52 2.053846e+01
## 296 822 52 1.580769e+01
## 297 883 52 1.698077e+01
## 298 547 52 1.051923e+01
## 299 78 51 1.529412e+00
## 300 171 52 3.288462e+00
## 301 597 52 1.148077e+01
## 302 532 52 1.023077e+01
## 303 236 51 4.627451e+00
## 304 52 51 1.019608e+00
## 305 125 52 2.403846e+00
## 306 84 52 1.615385e+00
## 307 56 51 1.098039e+00
## 308 14 52 2.692308e-01
## 309 26 52 5.000000e-01
## 310 15 49 3.061224e-01
## 311 83 27 3.074074e+00
## 312 8 4 2.000000e+00
## 313 1 1 1.000000e+00
## 314 0 0 NaN
## 315 1 1 1.000000e+00
year (x-axis) and ave_count (y-axis), using different colors for different diseases.
# Scatter plot of the weekly average count per year for Texas, one colour per
# disease, with a smoothed trend line drawn for each disease.
d %>%
  filter(state == "Texas") %>%
  mutate(ave_count = count / weeks_reporting) %>%
  ggplot(aes(year, ave_count, colour = disease)) +
  geom_point() +
  geom_smooth()
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Warning: Removed 4 rows containing non-finite values (stat_smooth).
## Warning: Removed 4 rows containing missing values (geom_point).
# Texas plot repeated with the Measles rows removed.
# Rows are dropped with a logical mask instead of `d[-which(...), ]`: a
# negative `which()` index selects zero rows when nothing matches, and the
# former index variable reused the name of base::c(). `%in%` never yields NA,
# so rows with a missing disease are kept, as they were before.
without_m <- d[!(d$disease %in% "Measles"), ]
without_m %>%
  filter(state == "Texas") %>%
  mutate(ave_count = count / weeks_reporting) %>%
  ggplot(mapping = aes(x = year, y = ave_count, color = disease)) +
  geom_point() +
  geom_smooth()
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Warning: Removed 2 rows containing non-finite values (stat_smooth).
## Warning: Removed 2 rows containing missing values (geom_point).
# Total reported cases, summed over all diseases, for every (year, state) pair.
summarise(
  group_by(d, year, state),
  total_count = sum(count)
)
## `summarise()` has grouped output by 'year'. You can override using the `.groups` argument.
## # A tibble: 4,284 × 3
## # Groups: year [84]
## year state total_count
## <dbl> <fct> <dbl>
## 1 1928 Alabama 9246
## 2 1928 Alaska 0
## 3 1928 Arizona 1268
## 4 1928 Arkansas 9157
## 5 1928 California 4960
## 6 1928 Colorado 2510
## 7 1928 Connecticut 10247
## 8 1928 Delaware 607
## 9 1928 District Of Columbia 2609
## 10 1928 Florida 1892
## # … with 4,274 more rows
# Cases per capita for every state-year, summed over all diseases.
# `population` is repeated on each disease row of a state-year, so it is taken
# once with first() rather than summed; summing it inflated the denominator by
# the number of diseases reported that year.
# NOTE: the printed output following this chunk predates this fix; re-run to
# refresh it.
d %>%
  group_by(state, year) %>%
  summarize(
    count_density = sum(count) / first(population),
    .groups = "drop"
  )
## `summarise()` has grouped output by 'state'. You can override using the `.groups` argument.
## # A tibble: 4,284 × 3
## # Groups: state [51]
## state year count_density
## <fct> <dbl> <dbl>
## 1 Alabama 1928 0.00119
## 2 Alabama 1929 0.000432
## 3 Alabama 1930 0.000556
## 4 Alabama 1931 0.00116
## 5 Alabama 1932 0.0000956
## 6 Alabama 1933 0.000227
## 7 Alabama 1934 0.00194
## 8 Alabama 1935 0.000887
## 9 Alabama 1936 0.000121
## 10 Alabama 1937 0.0000908
## # … with 4,274 more rows
# State-years ordered from the lowest to the highest cases per capita.
# `population` is repeated on each disease row of a state-year, so it is taken
# once with first() rather than summed; summing it inflated the denominator by
# the number of diseases reported that year.
# NOTE: the printed output following this chunk predates this fix; re-run to
# refresh it.
d %>%
  group_by(state, year) %>%
  summarize(
    count_density = sum(count) / first(population),
    .groups = "drop"
  ) %>%
  arrange(count_density)
## `summarise()` has grouped output by 'state'. You can override using the `.groups` argument.
## # A tibble: 4,284 × 3
## # Groups: state [51]
## state year count_density
## <fct> <dbl> <dbl>
## 1 Delaware 2001 0
## 2 Kansas 1962 0
## 3 Kansas 1963 0
## 4 Nevada 1928 0
## 5 Nevada 1929 0
## 6 Nevada 1930 0
## 7 Nevada 1931 0
## 8 Nevada 1932 0
## 9 Nevada 1933 0
## 10 Nevada 1934 0
## # … with 4,274 more rows