# Load the `backpain` dataset shipped with the HSAUR3 package and
# render its first six rows as a table.
data("backpain", package = "HSAUR3")
knitr::kable(utils::head(backpain, n = 6L))
## ID status driver suburban
## 1 case yes yes
## 1 control yes no
## 2 case yes yes
## 2 control yes yes
## 3 case yes no
## 3 control yes yes
# I use tidyverse tools, so I load the "tibble", "tidyr" and "dplyr" packages
library(tibble)
library(tidyr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Store the backpain data frame as a tibble.
dta <- tibble::as_tibble(x = HSAUR3::backpain)
# Preview the first rows: the data has four columns,
# "ID", "status", "driver" and "suburban".
utils::head(x = dta, n = 6L)
## # A tibble: 6 x 4
##   ID    status  driver suburban
##   <fct> <fct>   <fct>  <fct>   
## 1 1     case    yes    yes     
## 2 1     control yes    no      
## 3 2     case    yes    yes     
## 4 2     control yes    yes     
## 5 3     case    yes    no      
## 6 3     control yes    yes
# Inspect the dimensions (rows, columns) and the per-column summary of the data
c(nrow(dta), ncol(dta))
## [1] 434   4
summary(object = dta)
##        ID          status    driver    suburban 
##  1      :  2   case   :217   no : 86   no :200  
##  2      :  2   control:217   yes:348   yes:234  
##  3      :  2                                    
##  4      :  2                                    
##  5      :  2                                    
##  6      :  2                                    
##  (Other):422
# Group the data by driver, suburban and status, then count the
# observations in each of the eight resulting categories.
dta %>%
  dplyr::group_by(driver, suburban, status) %>%
  dplyr::summarise(total = dplyr::n())
## # A tibble: 8 x 4
## # Groups:   driver, suburban [4]
##   driver suburban status  total
##   <fct>  <fct>    <fct>   <int>
## 1 no     no       case       26
## 2 no     no       control    47
## 3 no     yes      case        6
## 4 no     yes      control     7
## 5 yes    no       case       64
## 6 yes    no       control    63
## 7 yes    yes      case      121
## 8 yes    yes      control   100
# Reshape the per-category counts from long to wide format: one row per
# driver/suburban combination, one count column per status level.
# `pivot_wider()` replaces the superseded `spread()`. Combinations with no
# observations are filled with 0 (a count) instead of NA.
dta %>%
  dplyr::group_by(driver, suburban, status) %>%
  dplyr::summarise(total = dplyr::n()) %>%
  tidyr::pivot_wider(
    names_from = status,
    values_from = total,
    values_fill = list(total = 0L)
  )
## # A tibble: 4 x 4
## # Groups:   driver, suburban [4]
##   driver suburban  case control
##   <fct>  <fct>    <int>   <int>
## 1 no     no          26      47
## 2 no     yes          6       7
## 3 yes    no          64      63
## 4 yes    yes        121     100
# Build the wide table of case/control counts per driver/suburban
# combination and append a `total` column (case + control).
# `pivot_wider()` replaces the superseded `spread()`. Empty cells are
# filled with 0 so that the row total is never NA when a status level is
# absent from a combination.
dta %>%
  dplyr::group_by(driver, suburban, status) %>%
  dplyr::summarise(total = dplyr::n()) %>%
  tidyr::pivot_wider(
    names_from = status,
    values_from = total,
    values_fill = list(total = 0L)
  ) %>%
  dplyr::mutate(total = case + control)
## # A tibble: 4 x 5
## # Groups:   driver, suburban [4]
##   driver suburban  case control total
##   <fct>  <fct>    <int>   <int> <int>
## 1 no     no          26      47    73
## 2 no     yes          6       7    13
## 3 yes    no          64      63   127
## 4 yes    yes        121     100   221