# Load the "backpain" dataset that ships with the "HSAUR3" package.
data("backpain", package = "HSAUR3")
# Render the first six rows as a formatted table.
knitr::kable(head(backpain, n = 6L))
| ID | status  | driver | suburban |
|:---|:--------|:-------|:---------|
| 1  | case    | yes    | yes      |
| 1  | control | yes    | no       |
| 2  | case    | yes    | yes      |
| 2  | control | yes    | yes      |
| 3  | case    | yes    | no       |
| 3  | control | yes    | yes      |
# I use the tidyverse approach, so I load the packages "tibble", "tidyr" and "dplyr"
library(tibble)
library(tidyr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Store the dataset as a tibble so it works with the tidyverse verbs below.
dta <- HSAUR3::backpain %>% as_tibble()
# Preview the first rows; the data has four columns:
# "ID", "status", "driver" and "suburban".
dta %>% head()
## # A tibble: 6 x 4
## ID status driver suburban
## <fct> <fct> <fct> <fct>
## 1 1 case yes yes
## 2 1 control yes no
## 3 2 case yes yes
## 4 2 control yes yes
## 5 3 case yes no
## 6 3 control yes yes
# Inspect the size of the data (number of rows, number of columns).
dta %>% dim()
## [1] 434 4
# Summarise every column (level counts for each factor).
dta %>% summary()
## ID status driver suburban
## 1 : 2 case :217 no : 86 no :200
## 2 : 2 control:217 yes:348 yes:234
## 3 : 2
## 4 : 2
## 5 : 2
## 6 : 2
## (Other):422
# Count the observations in each of the eight
# driver x suburban x status combinations.
dta %>%
  group_by(driver, suburban, status) %>%
  summarise(total = n())
## # A tibble: 8 x 4
## # Groups: driver, suburban [4]
## driver suburban status total
## <fct> <fct> <fct> <int>
## 1 no no case 26
## 2 no no control 47
## 3 no yes case 6
## 4 no yes control 7
## 5 yes no case 64
## 6 yes no control 63
## 7 yes yes case 121
## 8 yes yes control 100
# Reshape the counts from long to wide format: one row per
# driver x suburban combination, one column per level of "status".
# pivot_wider() replaces the superseded spread(); names_from/values_from
# correspond to spread()'s key/value arguments.
dta %>%
  dplyr::group_by(driver, suburban, status) %>%
  dplyr::summarize(total = dplyr::n()) %>%
  tidyr::pivot_wider(names_from = status, values_from = total)
## # A tibble: 4 x 4
## # Groups: driver, suburban [4]
## driver suburban case control
## <fct> <fct> <int> <int>
## 1 no no 26 47
## 2 no yes 6 7
## 3 yes no 64 63
## 4 yes yes 121 100
# Build the wide table of counts and append a "total" column holding
# the sum of the "case" and "control" counts for each row.
# pivot_wider() replaces the superseded spread(); names_from/values_from
# correspond to spread()'s key/value arguments.
dta %>%
  dplyr::group_by(driver, suburban, status) %>%
  dplyr::summarize(total = dplyr::n()) %>%
  tidyr::pivot_wider(names_from = status, values_from = total) %>%
  dplyr::mutate(total = case + control)
## # A tibble: 4 x 5
## # Groups: driver, suburban [4]
## driver suburban case control total
## <fct> <fct> <int> <int> <int>
## 1 no no 26 47 73
## 2 no yes 6 7 13
## 3 yes no 64 63 127
## 4 yes yes 121 100 221