backpain

# Load the "backpain" dataset shipped with the HSAUR3 package and render
# its first rows as a table.
data("backpain", package = "HSAUR3")
knitr::kable(utils::head(backpain))

ID	status	driver	suburban
1	case	yes	yes
1	control	yes	no
2	case	yes	yes
2	control	yes	yes
3	case	yes	no
3	control	yes	yes

# Use a tidyverse workflow: load the "tibble", "tidyr", and "dplyr" packages
library(tibble)
library(tidyr)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

# Store the dataset as a tibble.
dta <- tibble::as_tibble(HSAUR3::backpain)
# Preview the first six rows; the data has four columns:
# "ID", "status", "driver", and "suburban".
head(dta, n = 6L)

## # A tibble: 6 x 4
##   ID    status  driver suburban
##   <fct> <fct>   <fct>  <fct>   
## 1 1     case    yes    yes     
## 2 1     control yes    no      
## 3 2     case    yes    yes     
## 4 2     control yes    yes     
## 5 3     case    yes    no      
## 6 3     control yes    yes

# Check the dimensions of the data (number of rows, number of columns)
dim(dta)

## [1] 434   4

# Summarize each column; all four columns are factors, so this prints the
# count of observations per factor level
summary(dta)

##        ID          status    driver    suburban 
##  1      :  2   case   :217   no : 86   no :200  
##  2      :  2   control:217   yes:348   yes:234  
##  3      :  2                                    
##  4      :  2                                    
##  5      :  2                                    
##  6      :  2                                    
##  (Other):422

# Count the observations in each of the eight combinations of
# "driver", "suburban", and "status".
dta %>%
  dplyr::group_by(driver, suburban, status) %>%
  dplyr::summarise(total = dplyr::n())

## # A tibble: 8 x 4
## # Groups:   driver, suburban [4]
##   driver suburban status  total
##   <fct>  <fct>    <fct>   <int>
## 1 no     no       case       26
## 2 no     no       control    47
## 3 no     yes      case        6
## 4 no     yes      control     7
## 5 yes    no       case       64
## 6 yes    no       control    63
## 7 yes    yes      case      121
## 8 yes    yes      control   100

# Reshape the counts from long to wide format: one row per
# (driver, suburban) pair and one count column per status level
# ("case", "control"). tidyr::pivot_wider() is used in place of the
# superseded tidyr::spread(); the result stays grouped by driver and suburban.
dta %>%
  dplyr::group_by(driver, suburban, status) %>%
  dplyr::summarize(total = dplyr::n()) %>%
  tidyr::pivot_wider(names_from = status, values_from = total)

## # A tibble: 4 x 4
## # Groups:   driver, suburban [4]
##   driver suburban  case control
##   <fct>  <fct>    <int>   <int>
## 1 no     no          26      47
## 2 no     yes          6       7
## 3 yes    no          64      63
## 4 yes    yes        121     100

# Build the wide table of counts (one row per driver/suburban pair, one
# column per status level) and append a "total" column holding the row-wise
# sum of the "case" and "control" counts. tidyr::pivot_wider() is used in
# place of the superseded tidyr::spread().
dta %>%
  dplyr::group_by(driver, suburban, status) %>%
  dplyr::summarize(total = dplyr::n()) %>%
  tidyr::pivot_wider(names_from = status, values_from = total) %>%
  dplyr::mutate(total = case + control)

## # A tibble: 4 x 5
## # Groups:   driver, suburban [4]
##   driver suburban  case control total
##   <fct>  <fct>    <int>   <int> <int>
## 1 no     no          26      47    73
## 2 no     yes          6       7    13
## 3 yes    no          64      63   127
## 4 yes    yes        121     100   221