#Exercise 2.2 (Recreate 2 x 2 Table)

Using the counts in Table 2.28 recreate the table using any combination of the matrix, cbind, rbind, dimnames, or names functions.

library(mosaicData)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
wdat<-data("Whickham")
head(wdat <- Whickham)
##   outcome smoker age
## 1   Alive    Yes  23
## 2   Alive    Yes  18
## 3    Dead    Yes  71
## 4   Alive     No  67
## 5   Alive     No  64
## 6   Alive    Yes  38
dim(wdat)
## [1] 1314    3
length_s<-wdat %>%
  group_by(smoker) %>%
  summarize(length_smoker=length(smoker))

length_o<-wdat %>%
  group_by(outcome) %>%
  summarize(length_outcome=length(outcome))
  #table(length_s, length_o)

table(wdat$outcome, wdat$smoker)
##        
##          No Yes
##   Alive 502 443
##   Dead  230 139
mat<- matrix(c(139,443,230,502), 2, 2)
dimnames(mat) <- list("Outcome" = c("Dead", "Alive"), Smoker = c("Yes", "No"))
mat
##        Smoker
## Outcome Yes  No
##   Dead  139 230
##   Alive 443 502

#Exercise 2.3 (Create table marginal totals) Starting with the 2x2 matrix object we created in Table 2.28, using any combination of apply, cbind, rbind, names, and dimnames functions, recreate the Table 2.29.

You can also embed plots, for example:

mat2<-addmargins(mat)
mat2
##        Smoker
## Outcome Yes  No  Sum
##   Dead  139 230  369
##   Alive 443 502  945
##   Sum   582 732 1314
row<-apply(mat,2,sum)
row
## Yes  No 
## 582 732
column<-apply(mat,1,sum)
column
##  Dead Alive 
##   369   945
mat3<-cbind(mat,total=column)
mat4<-rbind(mat3,total=row)
## Warning in rbind(mat3, total = row): number of columns of result is not a
## multiple of vector length (arg 2)
mat4
##       Yes  No total
## Dead  139 230   369
## Alive 443 502   945
## total 582 732   582
names(dimnames(mat4)) <- c("Outcome", "Smoker")
mat4
##        Smoker
## Outcome Yes  No total
##   Dead  139 230   369
##   Alive 443 502   945
##   total 582 732   582

#Exercise 2.4 (Create marginal and joint probability distributions) Using the 2x2 data from Table 2.28, use the sweep and apply functions to calculate row, column, and joint probability distributions (i.e., create three tables with proportions).

row
## Yes  No 
## 582 732
column
##  Dead Alive 
##   369   945
rowdiv<-sweep(mat, 1, row, "/")
rowdiv
##        Smoker
## Outcome       Yes        No
##   Dead  0.2388316 0.3951890
##   Alive 0.6051913 0.6857923
coldiv<-sweep(mat, 2, column, "/")
coldiv
##        Smoker
## Outcome       Yes        No
##   Dead  0.3766938 0.2433862
##   Alive 1.2005420 0.5312169
joint<-mat/1314 
joint
##        Smoker
## Outcome       Yes        No
##   Dead  0.1057839 0.1750381
##   Alive 0.3371385 0.3820396

#Exercise 2.5 (Create measures of association) Using the data from the previous problems, recreate Table 2.30 and interpret the results.

mat5<-t(mat4)
mat5
##        Outcome
## Smoker  Dead Alive total
##   Yes    139   443   582
##   No     230   502   732
##   total  369   945   582
riske= mat5[1,1]/mat5[1,3]
risku= mat5[2,1]/mat5[2,3]
rr=riske/risku
rr2=riske/riske

oddse=mat5[1,1]/mat5[1,2]
oddse
## [1] 0.3137698
oddsu=mat5[2,1]/mat5[2,2]
oddsu
## [1] 0.4581673
oddsr=oddse/oddsu
oddsr
## [1] 0.6848366
oddsr2=oddse/oddse

info<-cbind("Measure", "Risk", "Risk ratio", "Odds", "Odds ratio", "Yes", riske, rr, oddse, oddsr, "No", risku, rr2, oddsu, oddsr2)
mat6<-matrix(info, 5, 3, byrow = FALSE)
mat6
##      [,1]         [,2]                [,3]               
## [1,] "Measure"    "Yes"               "No"               
## [2,] "Risk"       "0.238831615120275" "0.314207650273224"
## [3,] "Risk ratio" "0.76010757507844"  "1"                
## [4,] "Odds"       "0.313769751693002" "0.458167330677291"
## [5,] "Odds ratio" "0.68483658847777"  "1"

Interpretation

Risk: the risk of death among the women who smoke compared to the women who do not smoke is 24% lower for this study in this period of time.

Odds: the odds of death are 32% lower for women who smoke compared to women who do not smoke for this study in this period of time.

#Exercise 2.6 (Conduct analyses) Install the mosaicData R package using install.packages(“mosaicData”). Load the Whickham data set. Using the xtabs function create two-way and three-way contingency tables. Calculate “measures of association” What is your interpretation?

## discretize age into 4-level categorical variable
library(mosaicData)
library(dplyr)
dat<-Whickham
str(dat)
## 'data.frame':    1314 obs. of  3 variables:
##  $ outcome: Factor w/ 2 levels "Alive","Dead": 1 1 2 1 1 1 1 2 1 1 ...
##  $ smoker : Factor w/ 2 levels "No","Yes": 2 2 2 1 1 2 2 1 1 1 ...
##  $ age    : int  23 18 71 67 64 38 45 76 28 27 ...
wdat$age2<-cut(dat$age, breaks=c(15, 25, 45, 65, 100), right = FALSE)
table<-xtabs(~outcome+age2+smoker, data=wdat)
table
## , , smoker = No
## 
##        age2
## outcome [15,25) [25,45) [45,65) [65,100)
##   Alive      71     256     147       28
##   Dead        1      11      53      165
## 
## , , smoker = Yes
## 
##        age2
## outcome [15,25) [25,45) [45,65) [65,100)
##   Alive      53     217     167        6
##   Dead        2      13      80       44
table2<-apply(table, c(2,3), sum)
table2
##           smoker
## age2        No Yes
##   [15,25)   72  55
##   [25,45)  267 230
##   [45,65)  200 247
##   [65,100) 193  50
table3<-sweep(table, c(2,3), table2, "/")
table3
## , , smoker = No
## 
##        age2
## outcome    [15,25)    [25,45)    [45,65)   [65,100)
##   Alive 0.98611111 0.95880150 0.73500000 0.14507772
##   Dead  0.01388889 0.04119850 0.26500000 0.85492228
## 
## , , smoker = Yes
## 
##        age2
## outcome    [15,25)    [25,45)    [45,65)   [65,100)
##   Alive 0.96363636 0.94347826 0.67611336 0.12000000
##   Dead  0.03636364 0.05652174 0.32388664 0.88000000

Interpretation

When we stratify by age, we observe that there was confounding in the previous exercise. Now we observe that the greater risk is in old (+65 years) smokers.

#Exercise 2.7 (Practice tapply) Use the tapply function to calculate the mean age at study entry comparing smokers to non-smokers, dead vs. alive, smoker status stratified by outcome status (2 x 2 table).

library(mosaicData)
library(dplyr)
dat<-Whickham

tapply(X = dat$age, INDEX = dat$smoker, FUN = mean)
##       No      Yes 
## 48.69945 44.68213
tapply(X = dat$age, INDEX = dat$outcome, FUN = mean)
##    Alive     Dead 
## 40.08466 64.42547
tapply(X=dat$age, list(dat$smoker, dat$outcome), FUN=mean)
##        Alive     Dead
## No  40.03386 67.61304
## Yes 40.14221 59.15108

#Exercise 2.8 (California home prices, July, 2019) Read into R the median prices for single family homes in selected California counties for July, 2019. The R code is provided. The data object will be a data frame, but can be manipulated like a matrix.

  1. Create a character vector of the county names.
  2. Create a numeric vector of the home prices.
  3. Which county has the lowest price?
  4. Which county has the highest price?
  5. What is the median price for California counties? (i.e., the median of the median prices)
  6. What is the mean price? (i.e., the mean of the median prices) The data frame (hp) can be treated like a matrix. Sort the data frame by county name (hint: use the order function).
  7. Sort the data frame by home price (hint: use the order function).
  8. List the counties that have medican home prices between the 25% and 75% percentiles (hint: use the quantile function and index using a Boolean query).
hp <-read.csv("https://raw.githubusercontent.com/taragonmd/data/master/homes.csv",
as.is = TRUE)
dim(hp)
## [1] 51  2
str(hp)
## 'data.frame':    51 obs. of  2 variables:
##  $ County: chr  "Alameda" "Contra Costa" "Marin" "Napa" ...
##  $ Price : int  950000 660000 1257000 685000 1600000 1562500 1298000 465000 655000 611230 ...
a<-as.vector(hp$County)
a
##  [1] "Alameda"         "Contra Costa"    "Marin"          
##  [4] "Napa"            "San Francisco"   "San Mateo"      
##  [7] "Santa Clara"     "Solano"          "Sonoma"         
## [10] "Los Angeles"     "Orange"          "Riverside"      
## [13] "San Bernardino"  "San Diego"       "Ventura"        
## [16] "Monterey"        "San Luis Obispo" "Santa Barbara"  
## [19] "Santa Cruz"      "Fresno"          "Glenn"          
## [22] "Kern"            "Kings"           "Madera"         
## [25] "Merced"          "Placer"          "Sacramento"     
## [28] "San Benito"      "San Joaquin"     "Stanislaus"     
## [31] "Tulare"          "Amador"          "Butte"          
## [34] "Calaveras"       "Del Norte"       "El Dorado"      
## [37] "Humboldt"        "Lake"            "Lassen"         
## [40] "Mariposa"        "Mendocino"       "Mono"           
## [43] "Nevada"          "Plumas"          "Shasta"         
## [46] "Siskiyou"        "Sutter"          "Tehama"         
## [49] "Tuolumne"        "Yolo"            "Yuba"
b<-as.vector(hp$Price)
b
##  [1]  950000  660000 1257000  685000 1600000 1562500 1298000  465000
##  [9]  655000  611230  839450  420000  312000  650000  685000  651780
## [17]  625000  695000  899500  284400  268750  263000  254750  297500
## [25]  283000  496250  390000  570000  380000  335000  245000  312500
## [33]  374280  325500  275000  510500  334370  249000  225000  281400
## [41]  395000  692500  445000  280000  286000  210000  315000  255000
## [49]  300000  485000  299900
c<-a[which.min(b)]
c
## [1] "Siskiyou"
d<-a[which.max(b)]
d
## [1] "San Francisco"
e<-median(b)
e
## [1] 390000
f<-mean(b)
f
## [1] 518432.5
g<-order(hp$County, decreasing=TRUE)
g
##  [1] 51 50 15 49 31 48 47 30  9  8 46 45 19  7 18  6 17 29  5 14 13 28 27
## [24] 12 44 26 11 43  4 16 42 25 41 40  3 24 10 39 38 23 22 37 21 20 36 35
## [47]  2 34 33 32  1
g <- hp[order(hp$County),]
g
##             County   Price
## 1          Alameda  950000
## 32          Amador  312500
## 33           Butte  374280
## 34       Calaveras  325500
## 2     Contra Costa  660000
## 35       Del Norte  275000
## 36       El Dorado  510500
## 20          Fresno  284400
## 21           Glenn  268750
## 37        Humboldt  334370
## 22            Kern  263000
## 23           Kings  254750
## 38            Lake  249000
## 39          Lassen  225000
## 10     Los Angeles  611230
## 24          Madera  297500
## 3            Marin 1257000
## 40        Mariposa  281400
## 41       Mendocino  395000
## 25          Merced  283000
## 42            Mono  692500
## 16        Monterey  651780
## 4             Napa  685000
## 43          Nevada  445000
## 11          Orange  839450
## 26          Placer  496250
## 44          Plumas  280000
## 12       Riverside  420000
## 27      Sacramento  390000
## 28      San Benito  570000
## 13  San Bernardino  312000
## 14       San Diego  650000
## 5    San Francisco 1600000
## 29     San Joaquin  380000
## 17 San Luis Obispo  625000
## 6        San Mateo 1562500
## 18   Santa Barbara  695000
## 7      Santa Clara 1298000
## 19      Santa Cruz  899500
## 45          Shasta  286000
## 46        Siskiyou  210000
## 8           Solano  465000
## 9           Sonoma  655000
## 30      Stanislaus  335000
## 47          Sutter  315000
## 48          Tehama  255000
## 31          Tulare  245000
## 49        Tuolumne  300000
## 15         Ventura  685000
## 50            Yolo  485000
## 51            Yuba  299900
h<-hp[order(hp$Price),]
h
##             County   Price
## 46        Siskiyou  210000
## 39          Lassen  225000
## 31          Tulare  245000
## 38            Lake  249000
## 23           Kings  254750
## 48          Tehama  255000
## 22            Kern  263000
## 21           Glenn  268750
## 35       Del Norte  275000
## 44          Plumas  280000
## 40        Mariposa  281400
## 25          Merced  283000
## 20          Fresno  284400
## 45          Shasta  286000
## 24          Madera  297500
## 51            Yuba  299900
## 49        Tuolumne  300000
## 13  San Bernardino  312000
## 32          Amador  312500
## 47          Sutter  315000
## 34       Calaveras  325500
## 37        Humboldt  334370
## 30      Stanislaus  335000
## 33           Butte  374280
## 29     San Joaquin  380000
## 27      Sacramento  390000
## 41       Mendocino  395000
## 12       Riverside  420000
## 43          Nevada  445000
## 8           Solano  465000
## 50            Yolo  485000
## 26          Placer  496250
## 36       El Dorado  510500
## 28      San Benito  570000
## 10     Los Angeles  611230
## 17 San Luis Obispo  625000
## 14       San Diego  650000
## 16        Monterey  651780
## 9           Sonoma  655000
## 2     Contra Costa  660000
## 4             Napa  685000
## 15         Ventura  685000
## 42            Mono  692500
## 18   Santa Barbara  695000
## 11          Orange  839450
## 19      Santa Cruz  899500
## 1          Alameda  950000
## 3            Marin 1257000
## 7      Santa Clara 1298000
## 6        San Mateo 1562500
## 5    San Francisco 1600000
i<-quantile(hp$Price)
i
##      0%     25%     50%     75%    100% 
##  210000  285200  390000  653390 1600000
hp[which(hp$Price>285200 & hp$Price<653390),]
##             County  Price
## 8           Solano 465000
## 10     Los Angeles 611230
## 12       Riverside 420000
## 13  San Bernardino 312000
## 14       San Diego 650000
## 16        Monterey 651780
## 17 San Luis Obispo 625000
## 24          Madera 297500
## 26          Placer 496250
## 27      Sacramento 390000
## 28      San Benito 570000
## 29     San Joaquin 380000
## 30      Stanislaus 335000
## 32          Amador 312500
## 33           Butte 374280
## 34       Calaveras 325500
## 36       El Dorado 510500
## 37        Humboldt 334370
## 41       Mendocino 395000
## 43          Nevada 445000
## 45          Shasta 286000
## 47          Sutter 315000
## 49        Tuolumne 300000
## 50            Yolo 485000
## 51            Yuba 299900