#Exercise 2.2 (Recreate 2 x 2 Table)
Using the counts in Table 2.28 recreate the table using any combination of the matrix, cbind, rbind, dimnames, or names functions.
library(mosaicData)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
wdat<-data("Whickham")
head(wdat <- Whickham)
## outcome smoker age
## 1 Alive Yes 23
## 2 Alive Yes 18
## 3 Dead Yes 71
## 4 Alive No 67
## 5 Alive No 64
## 6 Alive Yes 38
dim(wdat)
## [1] 1314 3
length_s<-wdat %>%
group_by(smoker) %>%
summarize(length_smoker=length(smoker))
length_o<-wdat %>%
group_by(outcome) %>%
summarize(length_outcome=length(outcome))
#table(length_s, length_o)
table(wdat$outcome, wdat$smoker)
##
## No Yes
## Alive 502 443
## Dead 230 139
mat<- matrix(c(139,443,230,502), 2, 2)
dimnames(mat) <- list("Outcome" = c("Dead", "Alive"), Smoker = c("Yes", "No"))
mat
## Smoker
## Outcome Yes No
## Dead 139 230
## Alive 443 502
#Exercise 2.3 (Create table marginal totals) Starting with the 2x2 matrix object we created in Table 2.28, using any combination of apply, cbind, rbind, names, and dimnames functions, recreate the Table 2.29.
You can also embed plots, for example:
mat2<-addmargins(mat)
mat2
## Smoker
## Outcome Yes No Sum
## Dead 139 230 369
## Alive 443 502 945
## Sum 582 732 1314
row<-apply(mat,2,sum)
row
## Yes No
## 582 732
column<-apply(mat,1,sum)
column
## Dead Alive
## 369 945
mat3<-cbind(mat,total=column)
mat4<-rbind(mat3,total=row)
## Warning in rbind(mat3, total = row): number of columns of result is not a
## multiple of vector length (arg 2)
mat4
## Yes No total
## Dead 139 230 369
## Alive 443 502 945
## total 582 732 582
names(dimnames(mat4)) <- c("Outcome", "Smoker")
mat4
## Smoker
## Outcome Yes No total
## Dead 139 230 369
## Alive 443 502 945
## total 582 732 582
#Exercise 2.4 (Create marginal and joint probability distributions) Using the 2x2 data from Table 2.28, use the sweep and apply functions to calculate row, column, and joint probability distributions (i.e., create three tables with proportions).
row
## Yes No
## 582 732
column
## Dead Alive
## 369 945
rowdiv<-sweep(mat, 1, row, "/")
rowdiv
## Smoker
## Outcome Yes No
## Dead 0.2388316 0.3951890
## Alive 0.6051913 0.6857923
coldiv<-sweep(mat, 2, column, "/")
coldiv
## Smoker
## Outcome Yes No
## Dead 0.3766938 0.2433862
## Alive 1.2005420 0.5312169
joint<-mat/1314
joint
## Smoker
## Outcome Yes No
## Dead 0.1057839 0.1750381
## Alive 0.3371385 0.3820396
#Exercise 2.5 (Create measures of association) Using the data from the previous problems, recreate Table 2.30 and interpret the results.
mat5<-t(mat4)
mat5
## Outcome
## Smoker Dead Alive total
## Yes 139 443 582
## No 230 502 732
## total 369 945 582
riske= mat5[1,1]/mat5[1,3]
risku= mat5[2,1]/mat5[2,3]
rr=riske/risku
rr2=riske/riske
oddse=mat5[1,1]/mat5[1,2]
oddse
## [1] 0.3137698
oddsu=mat5[2,1]/mat5[2,2]
oddsu
## [1] 0.4581673
oddsr=oddse/oddsu
oddsr
## [1] 0.6848366
oddsr2=oddse/oddse
info<-cbind("Measure", "Risk", "Risk ratio", "Odds", "Odds ratio", "Yes", riske, rr, oddse, oddsr, "No", risku, rr2, oddsu, oddsr2)
mat6<-matrix(info, 5, 3, byrow = FALSE)
mat6
## [,1] [,2] [,3]
## [1,] "Measure" "Yes" "No"
## [2,] "Risk" "0.238831615120275" "0.314207650273224"
## [3,] "Risk ratio" "0.76010757507844" "1"
## [4,] "Odds" "0.313769751693002" "0.458167330677291"
## [5,] "Odds ratio" "0.68483658847777" "1"
Interpretation
Risk: the risk of death among the women who smoke compared to the women who do not smoke is 24% lower for this study in this period of time.
Odds: the odds of death are 32% lower for women who smoke compared to women who do not smoke for this study in this period of time.
#Exercise 2.6 (Conduct analyses) Install the mosaicData R package using install.packages(“mosaicData”). Load the Whickham data set. Using the xtabs function create two-way and three-way contingency tables. Calculate “measures of association” What is your interpretation?
## discretize age into 4-level categorical variable
library(mosaicData)
library(dplyr)
dat<-Whickham
str(dat)
## 'data.frame': 1314 obs. of 3 variables:
## $ outcome: Factor w/ 2 levels "Alive","Dead": 1 1 2 1 1 1 1 2 1 1 ...
## $ smoker : Factor w/ 2 levels "No","Yes": 2 2 2 1 1 2 2 1 1 1 ...
## $ age : int 23 18 71 67 64 38 45 76 28 27 ...
wdat$age2<-cut(dat$age, breaks=c(15, 25, 45, 65, 100), right = FALSE)
table<-xtabs(~outcome+age2+smoker, data=wdat)
table
## , , smoker = No
##
## age2
## outcome [15,25) [25,45) [45,65) [65,100)
## Alive 71 256 147 28
## Dead 1 11 53 165
##
## , , smoker = Yes
##
## age2
## outcome [15,25) [25,45) [45,65) [65,100)
## Alive 53 217 167 6
## Dead 2 13 80 44
table2<-apply(table, c(2,3), sum)
table2
## smoker
## age2 No Yes
## [15,25) 72 55
## [25,45) 267 230
## [45,65) 200 247
## [65,100) 193 50
table3<-sweep(table, c(2,3), table2, "/")
table3
## , , smoker = No
##
## age2
## outcome [15,25) [25,45) [45,65) [65,100)
## Alive 0.98611111 0.95880150 0.73500000 0.14507772
## Dead 0.01388889 0.04119850 0.26500000 0.85492228
##
## , , smoker = Yes
##
## age2
## outcome [15,25) [25,45) [45,65) [65,100)
## Alive 0.96363636 0.94347826 0.67611336 0.12000000
## Dead 0.03636364 0.05652174 0.32388664 0.88000000
Interpretation
When we stratify by age, we observe that there was confounding in the previous exercise. Now we observe that the greater risk is in old (+65 years) smokers.
#Exercise 2.7 (Practice tapply) Use the tapply function to calculate the mean age at study entry comparing smokers to non-smokers, dead vs. alive, smoker status stratified by outcome status (2 x 2 table).
library(mosaicData)
library(dplyr)
dat<-Whickham
tapply(X = dat$age, INDEX = dat$smoker, FUN = mean)
## No Yes
## 48.69945 44.68213
tapply(X = dat$age, INDEX = dat$outcome, FUN = mean)
## Alive Dead
## 40.08466 64.42547
tapply(X=dat$age, list(dat$smoker, dat$outcome), FUN=mean)
## Alive Dead
## No 40.03386 67.61304
## Yes 40.14221 59.15108
#Exercise 2.8 (California home prices, July, 2019) Read into R the median prices for single family homes in selected California counties for July, 2019. The R code is provided. The data object will be a data frame, but can be manipulated like a matrix.
hp <-read.csv("https://raw.githubusercontent.com/taragonmd/data/master/homes.csv",
as.is = TRUE)
dim(hp)
## [1] 51 2
str(hp)
## 'data.frame': 51 obs. of 2 variables:
## $ County: chr "Alameda" "Contra Costa" "Marin" "Napa" ...
## $ Price : int 950000 660000 1257000 685000 1600000 1562500 1298000 465000 655000 611230 ...
a<-as.vector(hp$County)
a
## [1] "Alameda" "Contra Costa" "Marin"
## [4] "Napa" "San Francisco" "San Mateo"
## [7] "Santa Clara" "Solano" "Sonoma"
## [10] "Los Angeles" "Orange" "Riverside"
## [13] "San Bernardino" "San Diego" "Ventura"
## [16] "Monterey" "San Luis Obispo" "Santa Barbara"
## [19] "Santa Cruz" "Fresno" "Glenn"
## [22] "Kern" "Kings" "Madera"
## [25] "Merced" "Placer" "Sacramento"
## [28] "San Benito" "San Joaquin" "Stanislaus"
## [31] "Tulare" "Amador" "Butte"
## [34] "Calaveras" "Del Norte" "El Dorado"
## [37] "Humboldt" "Lake" "Lassen"
## [40] "Mariposa" "Mendocino" "Mono"
## [43] "Nevada" "Plumas" "Shasta"
## [46] "Siskiyou" "Sutter" "Tehama"
## [49] "Tuolumne" "Yolo" "Yuba"
b<-as.vector(hp$Price)
b
## [1] 950000 660000 1257000 685000 1600000 1562500 1298000 465000
## [9] 655000 611230 839450 420000 312000 650000 685000 651780
## [17] 625000 695000 899500 284400 268750 263000 254750 297500
## [25] 283000 496250 390000 570000 380000 335000 245000 312500
## [33] 374280 325500 275000 510500 334370 249000 225000 281400
## [41] 395000 692500 445000 280000 286000 210000 315000 255000
## [49] 300000 485000 299900
c<-a[which.min(b)]
c
## [1] "Siskiyou"
d<-a[which.max(b)]
d
## [1] "San Francisco"
e<-median(b)
e
## [1] 390000
f<-mean(b)
f
## [1] 518432.5
g<-order(hp$County, decreasing=TRUE)
g
## [1] 51 50 15 49 31 48 47 30 9 8 46 45 19 7 18 6 17 29 5 14 13 28 27
## [24] 12 44 26 11 43 4 16 42 25 41 40 3 24 10 39 38 23 22 37 21 20 36 35
## [47] 2 34 33 32 1
g <- hp[order(hp$County),]
g
## County Price
## 1 Alameda 950000
## 32 Amador 312500
## 33 Butte 374280
## 34 Calaveras 325500
## 2 Contra Costa 660000
## 35 Del Norte 275000
## 36 El Dorado 510500
## 20 Fresno 284400
## 21 Glenn 268750
## 37 Humboldt 334370
## 22 Kern 263000
## 23 Kings 254750
## 38 Lake 249000
## 39 Lassen 225000
## 10 Los Angeles 611230
## 24 Madera 297500
## 3 Marin 1257000
## 40 Mariposa 281400
## 41 Mendocino 395000
## 25 Merced 283000
## 42 Mono 692500
## 16 Monterey 651780
## 4 Napa 685000
## 43 Nevada 445000
## 11 Orange 839450
## 26 Placer 496250
## 44 Plumas 280000
## 12 Riverside 420000
## 27 Sacramento 390000
## 28 San Benito 570000
## 13 San Bernardino 312000
## 14 San Diego 650000
## 5 San Francisco 1600000
## 29 San Joaquin 380000
## 17 San Luis Obispo 625000
## 6 San Mateo 1562500
## 18 Santa Barbara 695000
## 7 Santa Clara 1298000
## 19 Santa Cruz 899500
## 45 Shasta 286000
## 46 Siskiyou 210000
## 8 Solano 465000
## 9 Sonoma 655000
## 30 Stanislaus 335000
## 47 Sutter 315000
## 48 Tehama 255000
## 31 Tulare 245000
## 49 Tuolumne 300000
## 15 Ventura 685000
## 50 Yolo 485000
## 51 Yuba 299900
h<-hp[order(hp$Price),]
h
## County Price
## 46 Siskiyou 210000
## 39 Lassen 225000
## 31 Tulare 245000
## 38 Lake 249000
## 23 Kings 254750
## 48 Tehama 255000
## 22 Kern 263000
## 21 Glenn 268750
## 35 Del Norte 275000
## 44 Plumas 280000
## 40 Mariposa 281400
## 25 Merced 283000
## 20 Fresno 284400
## 45 Shasta 286000
## 24 Madera 297500
## 51 Yuba 299900
## 49 Tuolumne 300000
## 13 San Bernardino 312000
## 32 Amador 312500
## 47 Sutter 315000
## 34 Calaveras 325500
## 37 Humboldt 334370
## 30 Stanislaus 335000
## 33 Butte 374280
## 29 San Joaquin 380000
## 27 Sacramento 390000
## 41 Mendocino 395000
## 12 Riverside 420000
## 43 Nevada 445000
## 8 Solano 465000
## 50 Yolo 485000
## 26 Placer 496250
## 36 El Dorado 510500
## 28 San Benito 570000
## 10 Los Angeles 611230
## 17 San Luis Obispo 625000
## 14 San Diego 650000
## 16 Monterey 651780
## 9 Sonoma 655000
## 2 Contra Costa 660000
## 4 Napa 685000
## 15 Ventura 685000
## 42 Mono 692500
## 18 Santa Barbara 695000
## 11 Orange 839450
## 19 Santa Cruz 899500
## 1 Alameda 950000
## 3 Marin 1257000
## 7 Santa Clara 1298000
## 6 San Mateo 1562500
## 5 San Francisco 1600000
i<-quantile(hp$Price)
i
## 0% 25% 50% 75% 100%
## 210000 285200 390000 653390 1600000
hp[which(hp$Price>285200 & hp$Price<653390),]
## County Price
## 8 Solano 465000
## 10 Los Angeles 611230
## 12 Riverside 420000
## 13 San Bernardino 312000
## 14 San Diego 650000
## 16 Monterey 651780
## 17 San Luis Obispo 625000
## 24 Madera 297500
## 26 Placer 496250
## 27 Sacramento 390000
## 28 San Benito 570000
## 29 San Joaquin 380000
## 30 Stanislaus 335000
## 32 Amador 312500
## 33 Butte 374280
## 34 Calaveras 325500
## 36 El Dorado 510500
## 37 Humboldt 334370
## 41 Mendocino 395000
## 43 Nevada 445000
## 45 Shasta 286000
## 47 Sutter 315000
## 49 Tuolumne 300000
## 50 Yolo 485000
## 51 Yuba 299900