BIO 509 Examples

## Settings for RMarkdown http://yihui.name/knitr/options#chunk_options
opts_chunk$set(comment = "", warning = FALSE, message = FALSE, tidy = FALSE, 
    echo = TRUE, fig.width = 10, fig.height = 8)
options(width = 116, scipen = 10)

setwd("~/statistics/bio509/")

Examples 1

## Remove everything first
rm(list = ls())

## Load data
load("./lungcancer.RData")

## 1. LiSt objects
ls()
[1] "FIPS"   "county"

## 2. row names and column names at the same time (row then col)
dimnames(county)
[[1]]
 [1] "1"  "2"  "3"  "4"  "5"  "6"  "7"  "8"  "9"  "10" "11" "12" "13" "14" "15" "16" "17" "18" "19" "20" "21" "22"
[23] "23" "24" "25" "26" "27" "28" "29" "30" "31" "32" "33" "34" "35" "36" "37" "38" "39" "40" "41" "42" "43" "44"
[45] "45" "46" "47" "48" "49" "50" "51" "52" "53" "54" "55" "56" "57" "58" "59" "60" "61" "62" "63" "64" "65" "66"
[67] "67"

[[2]]
[1] "COUNTY"       "N"            "LC"           "PM2.5"        "state"        "smoking_rate"


## 3. Check first part of county
head(county, 10)
   COUNTY     N   LC  PM2.5 state smoking_rate
1    9001 75308 1076 13.198    CT       0.4689
2    9003 84555 1013 10.828    CT       0.4740
3    9005 19690  241  8.950    CT       0.5196
4    9007 16475  203 10.120    CT       0.5375
5    9009 78439 1129 14.504    CT       0.5032
6    9011 30761  487 10.881    CT       0.5322
7    9013 10722  155  8.991    CT       0.5074
8    9015 12309  184  9.062    CT       0.5383
9   23001 13238  210  9.648    ME       0.5542
10  23003 11494  162  9.321    ME       0.5880

## 4. Sort by PM2.5 using orderBy in package doBy
library(doBy)
mydata <- orderBy(formula = ~ +PM2.5, data = county)

## 5. 5th and 95th percentile
quantile(county$PM2.5, probs = c(0.05, 0.95))
    5%    95% 
 5.997 13.029 

## 6. mean of each column
colMeans(county[, names(county) != "state"])
      COUNTY            N           LC        PM2.5 smoking_rate 
  30460.2537   20190.0000     288.9701       9.0094       0.5316 
colMeans(county[, sapply(county, is.factor) == FALSE])
      COUNTY            N           LC        PM2.5 smoking_rate 
  30460.2537   20190.0000     288.9701       9.0094       0.5316 

## 7. LC by state
library(plyr)
ddply(county, "state", summarise, total.LC = sum(LC))
  state total.LC
1    CT     4488
2    MA     8487
3    ME     2575
4    NH     1762
5    RI     1247
6    VT      802

library(doBy)
summaryBy(LC ~ state, county, FUN = sum)
  state LC.sum
1    CT   4488
2    MA   8487
3    ME   2575
4    NH   1762
5    RI   1247
6    VT    802

## 8. Restrict mydata to state of ME
subset(mydata, state == "ME")
   COUNTY     N  LC  PM2.5 state smoking_rate
23  23029  5645 113  4.886    ME       0.6387
19  23021  2822  39  5.895    ME       0.5563
13  23009  7576 108  5.966    ME       0.5538
15  23013  6160  81  6.183    ME       0.5092
17  23017  7801 116  6.273    ME       0.5882
22  23027  4573  69  6.577    ME       0.5747
12  23007  3869  43  6.694    ME       0.5235
21  23025  6670  94  6.914    ME       0.5879
16  23015  5705  80  6.992    ME       0.5778
20  23023  3890  52  7.307    ME       0.5945
18  23019 17685 333  9.007    ME       0.5495
10  23003 11494 162  9.321    ME       0.5880
24  23031 22576 360  9.422    ME       0.5645
9   23001 13238 210  9.648    ME       0.5542
14  23011 15211 216  9.991    ME       0.5245
11  23005 32151 499 10.434    ME       0.5430

## 9. Create vector Y by diving LC by N
Y <- with(county, LC / N)
Y
 [1] 0.014288 0.011980 0.012240 0.012322 0.014393 0.015832 0.014456 0.014948 0.015863 0.014094 0.015521 0.011114
[13] 0.014256 0.014200 0.013149 0.014023 0.014870 0.018830 0.013820 0.013368 0.014093 0.015089 0.020018 0.015946
[25] 0.014885 0.014534 0.014972 0.012712 0.015300 0.011078 0.012353 0.013009 0.015096 0.008299 0.017382 0.015981
[37] 0.016860 0.013374 0.016177 0.014452 0.012759 0.011077 0.009364 0.013825 0.009782 0.015854 0.013688 0.009490
[49] 0.012672 0.014905 0.016030 0.013337 0.014791 0.008495 0.010326 0.011822 0.012391 0.008395 0.015599 0.007772
[61] 0.012647 0.011923 0.008262 0.015882 0.008348 0.010016 0.009190

## 10. Matrix smoking rate, PM2.5, and 1's only
X <- cbind(county[,c("smoking_rate", "PM2.5")], one = 1)
X <- as.matrix(X)
class(X)
[1] "matrix"

## 11.

## a.
solve(t(X) %*% X) %*% t(X) %*% Y
                  [,1]
smoking_rate 0.0183799
PM2.5        0.0003519
one          0.0003993

## b.
lm(Y ~ smoking_rate + PM2.5, data = county)

Call:
lm(formula = Y ~ smoking_rate + PM2.5, data = county)

Coefficients:
 (Intercept)  smoking_rate         PM2.5  
    0.000399      0.018380      0.000352  

Example 2

This was too much of math for me…