## Load dataset directly from the internet
lbw <- read.table("http://www.umass.edu/statdata/statdata/data/lowbwt.dat", head = T, skip = 4)
## Change variable names to lower cases
names(lbw) <- tolower(names(lbw))
## Recoding using within function
lbw <- within(lbw, {
## Relabel race: 1, 2, 3 -> White, Black, Other
race.cat <- factor(race, levels = 1:3, labels = c("White","Black","Other"))
## Dichotomize ptl
preterm <- factor(ptl >= 1, levels = c(FALSE,TRUE), labels = c("0","1+"))
## You can alse use cut
## preterm <- cut(ptl, breaks = c(-Inf, 0, Inf), labels = c("0","1+"))
## Change 0,1 binary to No,Yes binary
smoke <- factor(smoke, levels = c(0,1), labels = c("No","Yes"))
ht <- factor(ht , levels = c(0,1), labels = c("No","Yes"))
ui <- factor(ui , levels = c(0,1), labels = c("No","Yes"))
low <- factor(low , levels = c(0,1), labels = c("No","Yes"))
## Categorize ftv (frequency of visit): 0 1 2 3 4 6 -> None(0), Normal(1-2), Many (>= 3)
ftv.cat <- cut(ftv, breaks = c(-Inf, 0, 2, Inf), labels = c("None","Normal","Many"))
## Make Normal the reference level
ftv.cat <- relevel(ftv.cat, ref = "Normal")
})
These functions take data, split it by specified variables, and apply a specified summary function.
## One categorical variable
table(lbw$smoke)
No Yes
115 74
## Two categorical variables
table(lbw$smoke, lbw$ftv.cat)
Normal None Many
No 54 55 6
Yes 23 45 6
## Two categorical variables (variable name included)
xtabs( ~ smoke + ftv.cat, data = lbw)
ftv.cat
smoke Normal None Many
No 54 55 6
Yes 23 45 6
## tapply can be used by measuring length of an arbitrary variable
with(lbw, tapply(age, list(smoke, ftv.cat), FUN = length))
Normal None Many
No 54 55 6
Yes 23 45 6
## Two categorical variables (variable name included)
tab.3d <- xtabs( ~ smoke + ftv.cat + low, data = lbw)
## Flat table
ftable(tab.3d)
low No Yes
smoke ftv.cat
No Normal 43 11
None 39 16
Many 4 2
Yes Normal 16 7
None 25 20
Many 3 3
Objects that are really single vectors (vectors, matrices, and arrays) can be indexed the following ways.
## Original
tab.3d
, , low = No
ftv.cat
smoke Normal None Many
No 43 39 4
Yes 16 25 3
, , low = Yes
ftv.cat
smoke Normal None Many
No 11 16 2
Yes 7 20 3
## No smokers only
tab.3d["No",,]
low
ftv.cat No Yes
Normal 43 11
None 39 16
Many 4 2
## Smokers only
tab.3d[2,,]
low
ftv.cat No Yes
Normal 16 7
None 25 20
Many 3 3
## Smokers who had low birth weight baby
tab.3d["Yes",, "Yes"]
Normal None Many
7 20 3
Objects that are multiple vectors (data frames and lists) can also be addressed by element names. Elements can be examined by str().
## Examine lbw
str(lbw)
'data.frame': 189 obs. of 14 variables:
$ id : int 85 86 87 88 89 91 92 93 94 95 ...
$ low : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
$ age : int 19 33 20 21 18 21 22 17 29 26 ...
$ lwt : int 182 155 105 108 107 124 118 103 123 113 ...
$ race : int 2 3 1 1 1 3 1 3 1 1 ...
$ smoke : Factor w/ 2 levels "No","Yes": 1 1 2 2 2 1 1 1 2 2 ...
$ ptl : int 0 0 0 0 0 0 0 0 0 0 ...
$ ht : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
$ ui : Factor w/ 2 levels "No","Yes": 2 1 1 2 2 1 1 1 1 1 ...
$ ftv : int 0 3 1 2 0 0 1 1 1 0 ...
$ bwt : int 2523 2551 2557 2594 2600 2622 2637 2637 2663 2665 ...
$ ftv.cat : Factor w/ 3 levels "Normal","None",..: 2 3 1 1 2 2 1 1 1 2 ...
$ preterm : Factor w/ 2 levels "0","1+": 1 1 1 1 1 1 1 1 1 1 ...
$ race.cat: Factor w/ 3 levels "White","Black",..: 2 3 1 1 1 3 1 3 1 1 ...
## Extract id element
lbw$id
[1] 85 86 87 88 89 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 111 112 113
[28] 114 115 116 117 118 119 120 121 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141
[55] 142 143 144 145 146 147 148 149 150 151 154 155 156 159 160 161 162 163 164 166 167 168 169 170 172 173 174
[82] 175 176 177 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 195 196 197 199 200 201 202 203 204
[109] 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 4 10 11 13 15
[136] 16 17 18 19 20 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 40 42 43 44 45 46
[163] 47 49 50 51 52 54 56 57 59 60 61 62 63 65 67 68 69 71 75 76 77 78 79 81 82 83 84
## Model objects can work in similar ways (linear regression)
lm.full <- lm(bwt ~ age + smoke + ht, data = lbw)
## str(lm.full)
## Coefficients element in the model object
lm.full$coefficients
(Intercept) age smokeYes htYes
2824.67 10.93 -273.62 -424.47
## Examine summary objects
lm.summary <- summary(lm.full)
str(lm.summary)
List of 11
$ call : language lm(formula = bwt ~ age + smoke + ht, data = lbw)
$ terms :Classes 'terms', 'formula' length 3 bwt ~ age + smoke + ht
.. ..- attr(*, "variables")= language list(bwt, age, smoke, ht)
.. ..- attr(*, "factors")= int [1:4, 1:3] 0 1 0 0 0 0 1 0 0 0 ...
.. .. ..- attr(*, "dimnames")=List of 2
.. .. .. ..$ : chr [1:4] "bwt" "age" "smoke" "ht"
.. .. .. ..$ : chr [1:3] "age" "smoke" "ht"
.. ..- attr(*, "term.labels")= chr [1:3] "age" "smoke" "ht"
.. ..- attr(*, "order")= int [1:3] 1 1 1
.. ..- attr(*, "intercept")= int 1
.. ..- attr(*, "response")= int 1
.. ..- attr(*, ".Environment")=<environment: R_GlobalEnv>
.. ..- attr(*, "predvars")= language list(bwt, age, smoke, ht)
.. ..- attr(*, "dataClasses")= Named chr [1:4] "numeric" "numeric" "factor" "factor"
.. .. ..- attr(*, "names")= chr [1:4] "bwt" "age" "smoke" "ht"
$ residuals : Named num [1:189] -509 -634 -213 -187 -148 ...
..- attr(*, "names")= chr [1:189] "1" "2" "3" "4" ...
$ coefficients : num [1:4, 1:4] 2824.7 10.9 -273.6 -424.5 239.6 ...
..- attr(*, "dimnames")=List of 2
.. ..$ : chr [1:4] "(Intercept)" "age" "smokeYes" "htYes"
.. ..$ : chr [1:4] "Estimate" "Std. Error" "t value" "Pr(>|t|)"
$ aliased : Named logi [1:4] FALSE FALSE FALSE FALSE
..- attr(*, "names")= chr [1:4] "(Intercept)" "age" "smokeYes" "htYes"
$ sigma : num 712
$ df : int [1:3] 4 185 4
$ r.squared : num 0.0627
$ adj.r.squared: num 0.0475
$ fstatistic : Named num [1:3] 4.12 3 185
..- attr(*, "names")= chr [1:3] "value" "numdf" "dendf"
$ cov.unscaled : num [1:4, 1:4] 0.1134 -0.00445 -0.01079 -0.00689 -0.00445 ...
..- attr(*, "dimnames")=List of 2
.. ..$ : chr [1:4] "(Intercept)" "age" "smokeYes" "htYes"
.. ..$ : chr [1:4] "(Intercept)" "age" "smokeYes" "htYes"
- attr(*, "class")= chr "summary.lm"
## Extract coefficients element in the summary object
lm.summary$coefficients
Estimate Std. Error t value Pr(>|t|)
(Intercept) 2824.67 239.603 11.789 2.748e-24
age 10.93 9.804 1.115 2.662e-01
smokeYes -273.62 106.147 -2.578 1.072e-02
htYes -424.47 212.287 -1.999 4.702e-02
## Generalized linear model: binomial family, logit link
logit.full <- glm(low ~ age + smoke + ht, data = lbw, family = binomial(link = "logit"))
logit.summary <- summary(logit.full)
logit.summary$coefficients
Estimate Std. Error z value Pr(>|z|)
(Intercept) -0.01716 0.76890 -0.02232 0.98220
age -0.05043 0.03249 -1.55219 0.12062
smokeYes 0.70089 0.32584 2.15104 0.03147
htYes 1.23394 0.62095 1.98718 0.04690
## Show contents of a data frame without formatting
print.default(lbw[,c("age","bwt","low")])
$age
[1] 19 33 20 21 18 21 22 17 29 26 19 19 22 30 18 18 15 25 20 28 32 31 36 28 25 28 17 29 26 17 17 24 35 25 25 29 19
[38] 27 31 33 21 19 23 21 18 18 32 19 24 22 22 23 22 30 19 16 21 30 20 17 17 23 24 28 26 20 24 28 20 22 22 31 23 16
[75] 16 18 25 32 20 23 22 32 30 20 23 17 19 23 36 22 24 21 19 25 16 29 29 19 19 30 24 19 24 23 20 25 30 22 18 16 32
[112] 18 29 33 20 28 14 28 25 16 20 26 21 22 25 31 35 19 24 45 28 29 34 25 25 27 23 24 24 21 32 19 25 16 25 20 21 24
[149] 21 20 25 19 19 26 24 17 20 22 27 20 17 25 20 18 18 20 21 26 31 15 23 20 24 15 23 30 22 17 23 17 26 20 26 14 28
[186] 14 23 17 21
$bwt
[1] 2523 2551 2557 2594 2600 2622 2637 2637 2663 2665 2722 2733 2750 2750 2769 2769 2778 2782 2807 2821 2835 2835
[23] 2836 2863 2877 2877 2906 2920 2920 2920 2920 2948 2948 2977 2977 2977 2977 2992 3005 3033 3042 3062 3062 3062
[45] 3076 3076 3080 3090 3090 3090 3100 3104 3132 3147 3175 3175 3203 3203 3203 3225 3225 3232 3232 3234 3260 3274
[67] 3274 3303 3317 3317 3317 3321 3331 3374 3374 3402 3416 3430 3444 3459 3460 3473 3475 3487 3544 3572 3572 3586
[89] 3600 3614 3614 3629 3629 3637 3643 3651 3651 3651 3651 3699 3728 3756 3770 3770 3770 3790 3799 3827 3856 3860
[111] 3860 3884 3884 3912 3940 3941 3941 3969 3983 3997 3997 4054 4054 4111 4153 4167 4174 4238 4593 4990 709 1021
[133] 1135 1330 1474 1588 1588 1701 1729 1790 1818 1885 1893 1899 1928 1928 1928 1936 1970 2055 2055 2082 2084 2084
[155] 2100 2125 2126 2187 2187 2211 2225 2240 2240 2282 2296 2296 2301 2325 2353 2353 2367 2381 2381 2381 2395 2410
[177] 2410 2414 2424 2438 2442 2450 2466 2466 2466 2495 2495 2495 2495
$low
[1] No No No No No No No No No No No No No No No No No No No No No No No No No No No
[28] No No No No No No No No No No No No No No No No No No No No No No No No No No No
[55] No No No No No No No No No No No No No No No No No No No No No No No No No No No
[82] No No No No No No No No No No No No No No No No No No No No No No No No No No No
[109] No No No No No No No No No No No No No No No No No No No No No No Yes Yes Yes Yes Yes
[136] Yes Yes Yes Yes Yes Yes Yes Yes Yes Yes Yes Yes Yes Yes Yes Yes Yes Yes Yes Yes Yes Yes Yes Yes Yes Yes Yes
[163] Yes Yes Yes Yes Yes Yes Yes Yes Yes Yes Yes Yes Yes Yes Yes Yes Yes Yes Yes Yes Yes Yes Yes Yes Yes Yes Yes
Levels: No Yes
attr(,"class")
[1] "data.frame"
## Show contents of a summary object without formatting
print.default(lm.summary)
$call
lm(formula = bwt ~ age + smoke + ht, data = lbw)
$terms
bwt ~ age + smoke + ht
attr(,"variables")
list(bwt, age, smoke, ht)
attr(,"factors")
age smoke ht
bwt 0 0 0
age 1 0 0
smoke 0 1 0
ht 0 0 1
attr(,"term.labels")
[1] "age" "smoke" "ht"
attr(,"order")
[1] 1 1 1
attr(,"intercept")
[1] 1
attr(,"response")
[1] 1
attr(,".Environment")
<environment: R_GlobalEnv>
attr(,"predvars")
list(bwt, age, smoke, ht)
attr(,"dataClasses")
bwt age smoke ht
"numeric" "numeric" "factor" "factor"
$residuals
1 2 3 4 5 6 7 8 9 10 11
-509.401 -634.469 -212.714 -186.647 -147.847 -432.268 -428.201 -373.534 -205.115 -170.315 -310.401
12 13 14 15 16 17 18 19 20 21 22
-299.401 109.264 -402.669 21.153 21.153 -210.667 -42.381 -236.335 -36.182 -339.536 -328.602
23 24 25 26 27 28 29 30 31 32 33
-382.270 -267.802 -221.002 -253.802 169.086 -221.736 84.685 -90.534 -90.534 134.552 14.284
34 35 36 37 38 39 40 41 42 43 44
-121.002 -121.002 108.885 218.219 145.752 115.018 121.151 261.353 29.599 -14.135 7.732
45 46 47 48 49 50 51 52 53 54 55
328.153 328.153 -94.536 57.599 2.932 298.419 459.264 27.865 340.419 267.952 142.599
56 57 58 59 60 61 62 63 64 65 66
175.399 422.353 50.331 159.665 214.466 214.466 155.865 144.932 103.198 424.685 230.665
67 68 69 70 71 72 73 74 75 76 77
186.932 445.818 273.665 251.799 525.419 431.018 528.486 374.399 648.020 380.532 317.998
78 79 80 81 82 83 84 85 86 87 88
529.085 674.286 382.865 394.799 298.464 322.331 443.665 467.865 835.086 539.599 509.865
89 90 91 92 93 94 95 96 97 98 99
381.730 548.799 526.932 574.732 1294.685 812.619 917.020 509.264 509.264 892.219 892.219
100 101 102 103 104 105 106 107 108 109 110
546.331 640.932 1421.685 682.932 693.865 726.665 1116.463 646.331 761.799 1108.153 860.399
111 112 113 114 115 116 117 118 119 120 121
685.464 862.532 1015.885 726.531 1170.286 810.198 963.266 838.198 884.998 997.399 953.665
122 123 124 125 126 127 128 129 130 131 132
945.065 999.732 1045.799 1054.998 1003.398 966.664 1479.219 1505.932 1673.329 -2148.182 -2120.736
133 134 135 136 137 138 139 140 141 142 143
-1363.317 -1343.537 -1624.002 -1531.869 -1488.135 -1386.068 -933.603 -566.182 -1082.915 -873.781 -1205.002
144 145 146 147 148 149 150 151 152 153 154
-1100.601 -896.381 -841.714 -1126.268 -877.448 -1084.268 -988.335 -1043.002 -950.401 -674.781 -751.315
155 156 157 158 159 160 161 162 163 164 165
-987.068 -611.914 -643.714 -604.581 -932.869 -558.714 -511.914 -858.002 -803.335 -739.468 -451.847
166 167 168 169 170 171 172 173 174 175 176
-473.714 -753.268 -783.935 -536.982 -635.667 -435.514 -388.714 -432.448 -607.667 -681.135 -469.048
177 178 179 180 181 182 183 184 185 186 187
-381.581 -322.914 -378.514 -572.534 -242.470 -593.335 -369.315 -238.113 -391.182 -482.734 -307.514
188 189
-91.069 138.818
$coefficients
Estimate Std. Error t value Pr(>|t|)
(Intercept) 2824.67 239.603 11.789 2.748e-24
age 10.93 9.804 1.115 2.662e-01
smokeYes -273.62 106.147 -2.578 1.072e-02
htYes -424.47 212.287 -1.999 4.702e-02
$aliased
(Intercept) age smokeYes htYes
FALSE FALSE FALSE FALSE
$sigma
[1] 711.5
$df
[1] 4 185 4
$r.squared
[1] 0.06267
$adj.r.squared
[1] 0.04747
$fstatistic
value numdf dendf
4.123 3.000 185.000
$cov.unscaled
(Intercept) age smokeYes htYes
(Intercept) 0.113403 -0.00445180 -0.01078701 -0.00688818
age -0.004452 0.00018987 0.00009075 0.00006273
smokeYes -0.010787 0.00009075 0.02225653 -0.00056613
htYes -0.006888 0.00006273 -0.00056613 0.08901977
attr(,"class")
[1] "summary.lm"