pacman::p_load(HSAUR3)
# Load the backpain data set from the HSAUR3 package and keep a working copy.
data(list = "backpain", package = "HSAUR3")
dta <- HSAUR3::backpain
# First check which variables backpain contains.
colnames(dta)
## [1] "ID" "status" "driver" "suburban"
##兩個(dplyr、tidyr)package一定要借出
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
# Tabulate the subjects by driving status and place of residence.
# group_by() sets the grouping variables; summarize() then produces one row
# per driver x suburban cell.
# The counts are taken directly on the long data: spreading `status` into wide
# columns and counting NA cells would report the subjects that LACK each
# status, and n() on the spread table would count spread rows, not subjects.
dta <- dta %>%
  group_by(driver, suburban) %>%
  summarize(case = sum(status == "case"),        # number of cases in the cell
            control = sum(status == "control"),  # number of controls in the cell
            total = n()) %>%                     # all subjects = case + control
  # Return a plain data frame rather than a (grouped) tibble.
  as.data.frame()
head(dta)
## driver suburban case control total
## 1 no no 26 47 73
## 2 no yes 6 7 13
## 3 yes no 64 63 127
## 4 yes yes 121 100 221
# Two built-in data sets with one row per US state.
# USArrests is used as is; state.x77 is converted with as.data.frame() so both
# objects are data frames.
dta2 <- datasets::USArrests
dta1 <- as.data.frame(datasets::state.x77)
# Peek at the first three rows of dta1 ...
head(dta1, n = 3L)
## Population Income Illiteracy Life Exp Murder HS Grad Frost Area
## Alabama 3615 3624 2.1 69.05 15.1 41.3 20 50708
## Alaska 365 6315 1.5 69.31 11.3 66.7 152 566432
## Arizona 2212 4530 1.8 70.55 7.8 58.1 15 113417
# ... and at the last three rows of dta2.
tail(dta2, n = 3L)
## Murder Assault UrbanPop Rape
## West Virginia 5.7 81 39 9.3
## Wisconsin 2.6 53 66 10.8
## Wyoming 6.8 161 60 15.6
## First check which variables dta1 and dta2 contain.
colnames(dta1)
## [1] "Population" "Income" "Illiteracy" "Life Exp" "Murder"
## [6] "HS Grad" "Frost" "Area"
colnames(dta2)
## [1] "Murder" "Assault" "UrbanPop" "Rape"
# Correlation matrices of all variables in each data set. Every coefficient is
# computed over all rows (states) of the data frame, not over the few states
# printed by head()/tail() earlier.
dta1r <- stats::cor(dta1)
dta2r <- stats::cor(dta2)
# head() shows only the first six rows of the 8 x 8 matrix, so the Frost and
# Area rows are not printed (their columns are).
head(dta1r, n = 6L)
## Population Income Illiteracy Life Exp Murder HS Grad
## Population 1.00000000 0.2082276 0.1076224 -0.06805195 0.3436428 -0.09848975
## Income 0.20822756 1.0000000 -0.4370752 0.34025534 -0.2300776 0.61993232
## Illiteracy 0.10762237 -0.4370752 1.0000000 -0.58847793 0.7029752 -0.65718861
## Life Exp -0.06805195 0.3402553 -0.5884779 1.00000000 -0.7808458 0.58221620
## Murder 0.34364275 -0.2300776 0.7029752 -0.78084575 1.0000000 -0.48797102
## HS Grad -0.09848975 0.6199323 -0.6571886 0.58221620 -0.4879710 1.00000000
## Frost Area
## Population -0.3321525 0.02254384
## Income 0.2262822 0.36331544
## Illiteracy -0.6719470 0.07726113
## Life Exp 0.2620680 -0.10733194
## Murder -0.5388834 0.22839021
## HS Grad 0.3667797 0.33354187
## In state.x77 the pairs with |r| > 0.7 are Illiteracy x Murder (0.70) and
## Murder x Life Exp (-0.78, i.e. a strong negative correlation).
head(dta2r, n = 6L)
## Murder Assault UrbanPop Rape
## Murder 1.00000000 0.8018733 0.06957262 0.5635788
## Assault 0.80187331 1.0000000 0.25887170 0.6652412
## UrbanPop 0.06957262 0.2588717 1.00000000 0.4113412
## Rape 0.56357883 0.6652412 0.41134124 1.0000000
## In USArrests the only pair with |r| > 0.7 is Murder x Assault (0.80).
## Summary: Murder is strongly correlated with Illiteracy and Life Exp in
## state.x77, and with Assault in USArrests.
## Naive merge of the two data sets: merge() joins on the only column name they
## share ("Murder") and ignores the state row names. Because the two sets of
## Murder values rarely coincide, the full outer join is filled with many NAs.
## NOTE(review): to line the rows up by state the join would have to use the
## row names instead - confirm whether that is done later in the script.
dtaa <- merge(dta1, dta2, by = "Murder", all.x = TRUE, all.y = TRUE)
head(dtaa, n = 6L)
## Murder Population Income Illiteracy Life Exp HS Grad Frost Area Assault
## 1 0.8 NA NA NA NA NA NA NA 45
## 2 1.4 637 5087 0.8 72.78 50.3 186 69273 NA
## 3 1.7 681 4167 0.5 72.08 53.3 172 75955 NA
## 4 2.1 NA NA NA NA NA NA NA 83
## 5 2.1 NA NA NA NA NA NA NA 57
## 6 2.2 NA NA NA NA NA NA NA 56
## UrbanPop Rape
## 1 44 7.3
## 2 NA NA
## 3 NA NA
## 4 51 7.8
## 5 56 9.5
## 6 57 11.3