## Settings for RMarkdown http://yihui.name/knitr/options#chunk_options
opts_chunk$set(comment = "", warning = FALSE, message = FALSE, tidy = FALSE,
echo = TRUE, fig.width = 10, fig.height = 8)
options(width = 116, scipen = 10)
setwd("~/statistics/bio509/")
## Remove everything first
rm(list = ls())
## Load data
load("./lungcancer.RData")
## 1. LiSt objects
ls()
[1] "FIPS" "county"
## 2. row names and column names at the same time (row then col)
dimnames(county)
[[1]]
[1] "1" "2" "3" "4" "5" "6" "7" "8" "9" "10" "11" "12" "13" "14" "15" "16" "17" "18" "19" "20" "21" "22"
[23] "23" "24" "25" "26" "27" "28" "29" "30" "31" "32" "33" "34" "35" "36" "37" "38" "39" "40" "41" "42" "43" "44"
[45] "45" "46" "47" "48" "49" "50" "51" "52" "53" "54" "55" "56" "57" "58" "59" "60" "61" "62" "63" "64" "65" "66"
[67] "67"
[[2]]
[1] "COUNTY" "N" "LC" "PM2.5" "state" "smoking_rate"
## 3. Check first part of county
head(county, 10)
COUNTY N LC PM2.5 state smoking_rate
1 9001 75308 1076 13.198 CT 0.4689
2 9003 84555 1013 10.828 CT 0.4740
3 9005 19690 241 8.950 CT 0.5196
4 9007 16475 203 10.120 CT 0.5375
5 9009 78439 1129 14.504 CT 0.5032
6 9011 30761 487 10.881 CT 0.5322
7 9013 10722 155 8.991 CT 0.5074
8 9015 12309 184 9.062 CT 0.5383
9 23001 13238 210 9.648 ME 0.5542
10 23003 11494 162 9.321 ME 0.5880
## 4. Sort by PM2.5 using orderBy in package doBy
library(doBy)
mydata <- orderBy(formula = ~ +PM2.5, data = county)
## 5. 5th and 95th percentile
quantile(county$PM2.5, probs = c(0.05, 0.95))
5% 95%
5.997 13.029
## 6. mean of each column
colMeans(county[, names(county) != "state"])
COUNTY N LC PM2.5 smoking_rate
30460.2537 20190.0000 288.9701 9.0094 0.5316
colMeans(county[, sapply(county, is.factor) == FALSE])
COUNTY N LC PM2.5 smoking_rate
30460.2537 20190.0000 288.9701 9.0094 0.5316
## 7. LC by state
library(plyr)
ddply(county, "state", summarise, total.LC = sum(LC))
state total.LC
1 CT 4488
2 MA 8487
3 ME 2575
4 NH 1762
5 RI 1247
6 VT 802
library(doBy)
summaryBy(LC ~ state, county, FUN = sum)
state LC.sum
1 CT 4488
2 MA 8487
3 ME 2575
4 NH 1762
5 RI 1247
6 VT 802
## 8. Restrict mydata to state of ME
subset(mydata, state == "ME")
COUNTY N LC PM2.5 state smoking_rate
23 23029 5645 113 4.886 ME 0.6387
19 23021 2822 39 5.895 ME 0.5563
13 23009 7576 108 5.966 ME 0.5538
15 23013 6160 81 6.183 ME 0.5092
17 23017 7801 116 6.273 ME 0.5882
22 23027 4573 69 6.577 ME 0.5747
12 23007 3869 43 6.694 ME 0.5235
21 23025 6670 94 6.914 ME 0.5879
16 23015 5705 80 6.992 ME 0.5778
20 23023 3890 52 7.307 ME 0.5945
18 23019 17685 333 9.007 ME 0.5495
10 23003 11494 162 9.321 ME 0.5880
24 23031 22576 360 9.422 ME 0.5645
9 23001 13238 210 9.648 ME 0.5542
14 23011 15211 216 9.991 ME 0.5245
11 23005 32151 499 10.434 ME 0.5430
## 9. Create vector Y by diving LC by N
Y <- with(county, LC / N)
Y
[1] 0.014288 0.011980 0.012240 0.012322 0.014393 0.015832 0.014456 0.014948 0.015863 0.014094 0.015521 0.011114
[13] 0.014256 0.014200 0.013149 0.014023 0.014870 0.018830 0.013820 0.013368 0.014093 0.015089 0.020018 0.015946
[25] 0.014885 0.014534 0.014972 0.012712 0.015300 0.011078 0.012353 0.013009 0.015096 0.008299 0.017382 0.015981
[37] 0.016860 0.013374 0.016177 0.014452 0.012759 0.011077 0.009364 0.013825 0.009782 0.015854 0.013688 0.009490
[49] 0.012672 0.014905 0.016030 0.013337 0.014791 0.008495 0.010326 0.011822 0.012391 0.008395 0.015599 0.007772
[61] 0.012647 0.011923 0.008262 0.015882 0.008348 0.010016 0.009190
## 10. Matrix smoking rate, PM2.5, and 1's only
X <- cbind(county[,c("smoking_rate", "PM2.5")], one = 1)
X <- as.matrix(X)
class(X)
[1] "matrix"
## 11.
## a.
solve(t(X) %*% X) %*% t(X) %*% Y
[,1]
smoking_rate 0.0183799
PM2.5 0.0003519
one 0.0003993
## b.
lm(Y ~ smoking_rate + PM2.5, data = county)
Call:
lm(formula = Y ~ smoking_rate + PM2.5, data = county)
Coefficients:
(Intercept) smoking_rate PM2.5
0.000399 0.018380 0.000352
This was too much of math for me…