pacman::p_load(HSAUR3)
# Register the backpain data set from HSAUR3 and keep a working copy in `dta`.
data(list = "backpain", package = "HSAUR3")
dta <- HSAUR3::backpain
# First, check which columns backpain contains.
colnames(dta)
## [1] "ID"       "status"   "driver"   "suburban"
## Both packages (dplyr and tidyr) must be loaded before running the code below
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyr)
# Tabulate the number of cases, controls and subjects for every
# driver x suburban combination.
#
# The counts are taken directly on the long data instead of spreading `status`
# to wide form: spread() would merge a case and a control that share the same
# ID, driver and suburban values into a single row, and is.na() on the spread
# columns counts the rows where that status is *absent* rather than present.
dta <- dta %>%
  # group_by() sets the grouping that summarize() aggregates over.
  group_by(driver, suburban) %>%
  summarize(case = sum(status == "case"),
            control = sum(status == "control"),
            # n() is the number of rows in the current group.
            total = n()) %>%
  # Drop any remaining grouping before converting to a plain data frame.
  ungroup() %>%
  as.data.frame()

head(dta)
##   driver suburban case control total
## 1     no       no   26      47    73
## 2     no      yes    6       7    13
## 3    yes       no   64      63   127
## 4    yes      yes  121     100   221
# Working copies of two built-in state-level data sets. state.x77 is converted
# to a data frame; USArrests is used as shipped.
dta2 <- datasets::USArrests
dta1 <- as.data.frame(datasets::state.x77)
head(dta1, n = 3L)
##         Population Income Illiteracy Life Exp Murder HS Grad Frost   Area
## Alabama       3615   3624        2.1    69.05   15.1    41.3    20  50708
## Alaska         365   6315        1.5    69.31   11.3    66.7   152 566432
## Arizona       2212   4530        1.8    70.55    7.8    58.1    15 113417
tail(dta2, n = 3L)
##               Murder Assault UrbanPop Rape
## West Virginia    5.7      81       39  9.3
## Wisconsin        2.6      53       66 10.8
## Wyoming          6.8     161       60 15.6
# First, check which variables dta1 and dta2 contain.
colnames(dta1)
## [1] "Population" "Income"     "Illiteracy" "Life Exp"   "Murder"    
## [6] "HS Grad"    "Frost"      "Area"
colnames(dta2)
## [1] "Murder"   "Assault"  "UrbanPop" "Rape"
# Pairwise correlation matrices of the variables within each data set.
dta1r <- stats::cor(dta1)
dta2r <- stats::cor(dta2)


# Show the first six rows of the state.x77 correlation matrix.
head(dta1r, n = 6L)
##             Population     Income Illiteracy    Life Exp     Murder     HS Grad
## Population  1.00000000  0.2082276  0.1076224 -0.06805195  0.3436428 -0.09848975
## Income      0.20822756  1.0000000 -0.4370752  0.34025534 -0.2300776  0.61993232
## Illiteracy  0.10762237 -0.4370752  1.0000000 -0.58847793  0.7029752 -0.65718861
## Life Exp   -0.06805195  0.3402553 -0.5884779  1.00000000 -0.7808458  0.58221620
## Murder      0.34364275 -0.2300776  0.7029752 -0.78084575  1.0000000 -0.48797102
## HS Grad    -0.09848975  0.6199323 -0.6571886  0.58221620 -0.4879710  1.00000000
##                 Frost        Area
## Population -0.3321525  0.02254384
## Income      0.2262822  0.36331544
## Illiteracy -0.6719470  0.07726113
## Life Exp    0.2620680 -0.10733194
## Murder     -0.5388834  0.22839021
## HS Grad     0.3667797  0.33354187
## The correlations are computed over every state in the data, not only the
## states printed by head() earlier. Among the rows shown, the pairs with
## |r| > 0.7 are Illiteracy x Murder (0.70) and Murder x Life Exp (-0.78).

# Show the USArrests correlation matrix (it has only four rows).
head(dta2r, n = 6L)
##              Murder   Assault   UrbanPop      Rape
## Murder   1.00000000 0.8018733 0.06957262 0.5635788
## Assault  0.80187331 1.0000000 0.25887170 0.6652412
## UrbanPop 0.06957262 0.2588717 1.00000000 0.4113412
## Rape     0.56357883 0.6652412 0.41134124 1.0000000
## Again computed over every state in the data; the only pair with r > 0.7 is
## Murder x Assault (0.80).

## Summary: Murder is strongly correlated with Illiteracy and Life Exp in
## state.x77, and with Assault in USArrests.

# A plain merge() joins on the only column name the two data frames share
# (Murder), so few rows match and the full outer join is mostly NA.
dtaa <- merge(x = dta1, y = dta2, all.x = TRUE, all.y = TRUE)
head(dtaa, n = 6L)
##   Murder Population Income Illiteracy Life Exp HS Grad Frost  Area Assault
## 1    0.8         NA     NA         NA       NA      NA    NA    NA      45
## 2    1.4        637   5087        0.8    72.78    50.3   186 69273      NA
## 3    1.7        681   4167        0.5    72.08    53.3   172 75955      NA
## 4    2.1         NA     NA         NA       NA      NA    NA    NA      83
## 5    2.1         NA     NA         NA       NA      NA    NA    NA      57
## 6    2.2         NA     NA         NA       NA      NA    NA    NA      56
##   UrbanPop Rape
## 1       44  7.3
## 2       NA   NA
## 3       NA   NA
## 4       51  7.8
## 5       56  9.5
## 6       57 11.3