The AAUP2 data set is a comma-delimited fixed column format text file with ’*’ for missing value. Import the file into R and indicate missing values by ‘NA’. Hint: ?read.csv
fL <- "D:/User/Desktop/datamanagement/20211025/aaup2.txt"
dta_1 <- read.table(fL, header=T, sep='\t')
readr::fwf_empty("/User/Desktop/datamanagement/20211025/aaup2.txt")[1:2]
## $begin
## [1] 0 6 40 45 49 53 57 61 66 70 74 79 83 87 92 95
##
## $end
## [1] 5 39 43 48 52 56 60 65 69 73 78 82 86 90 94 NA
#直接匯入txt發現資料很亂,檢視txt原檔,資料確實一團,嘗試用readr::fwf_empty
dta_1 <- readr::read_fwf("/User/Desktop/datamanagement/20211025/aaup2.txt", skip=1,readr::fwf_cols(X1=6, X2=34, X3=4, X4=4, X5=4, X6=4,X7=4,X8=5,X9=4,X10=4,X11=5,X12=4,X13=4,X14=4,X15=3,X16=5))
## Rows: 1160 Columns: 16
## -- Column specification --------------------------------------------------------
##
## chr (9): X2, X3, X4, X5, X6, X8, X9, X10, X16
## dbl (7): X1, X7, X11, X12, X13, X14, X15
##
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
dplyr::glimpse(dta_1) #glimpse(), 類似 {R} base 的 str() 函式
## Rows: 1,160
## Columns: 16
## $ X1 <dbl> 1063, 1065, 11462, 1002, 1004, 1008, 1009, 1012, 1016, 1019, 1020,~
## $ X2 <chr> "Univ.Alaska-Fairbanks AK", "Univ.Alaska-Southeast ~
## $ X3 <chr> "I", "IIA", "IIA", "IIA", "IIA", "IIB", "I", "IIB", "IIB", "IIB", ~
## $ X4 <chr> "686", "533", "612", "442", "441", "466", "580", "498", "506", "33~
## $ X5 <chr> "560", "494", "507", "369", "385", "394", "437", "379", "412", "30~
## $ X6 <chr> "432", "329", "414", "310", "310", "351", "374", "322", "359", "28~
## $ X7 <dbl> 508, 415, 498, 350, 388, 396, 455, 401, 411, 301, 386, 300, 291, 2~
## $ X8 <chr> "914", "716", "825", "530", "542", "558", "692", "655", "607", "42~
## $ X9 <chr> "753", "663", "681", "444", "473", "476", "527", "501", "508", "37~
## $ X10 <chr> "572", "442", "557", "376", "383", "427", "451", "404", "445", "34~
## $ X11 <dbl> 677, 559, 670, 423, 477, 478, 546, 523, 503, 366, 493, 363, 363, 3~
## $ X12 <dbl> 74, 9, 115, 59, 57, 20, 366, 34, 67, 8, 106, 27, 17, 18, 83, 23, 1~
## $ X13 <dbl> 125, 26, 124, 77, 33, 18, 354, 25, 40, 15, 42, 25, 19, 28, 46, 17,~
## $ X14 <dbl> 118, 20, 101, 102, 35, 30, 301, 27, 66, 19, 66, 33, 31, 28, 77, 14~
## $ X15 <dbl> 4, NA, 2, 2, NA, NA, 6, NA, 2, NA, 5, NA, 1, NA, NA, NA, 1, 1, 4, ~
## $ X16 <chr> "0 40", "9 7", "1 39", "4 26", "2 12", "0 6", "6 110", "3 ~
str(dta_1)
## spec_tbl_df [1,160 x 16] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ X1 : num [1:1160] 1063 1065 11462 1002 1004 ...
## $ X2 : chr [1:1160] "Univ.Alaska-Fairbanks AK" "Univ.Alaska-Southeast AK" "Univ.Alaska-Anchorage AK" "Alabama Agri.&Mech. Univ. AL" ...
## $ X3 : chr [1:1160] "I" "IIA" "IIA" "IIA" ...
## $ X4 : chr [1:1160] "686" "533" "612" "442" ...
## $ X5 : chr [1:1160] "560" "494" "507" "369" ...
## $ X6 : chr [1:1160] "432" "329" "414" "310" ...
## $ X7 : num [1:1160] 508 415 498 350 388 396 455 401 411 301 ...
## $ X8 : chr [1:1160] "914" "716" "825" "530" ...
## $ X9 : chr [1:1160] "753" "663" "681" "444" ...
## $ X10: chr [1:1160] "572" "442" "557" "376" ...
## $ X11: num [1:1160] 677 559 670 423 477 478 546 523 503 366 ...
## $ X12: num [1:1160] 74 9 115 59 57 20 366 34 67 8 ...
## $ X13: num [1:1160] 125 26 124 77 33 18 354 25 40 15 ...
## $ X14: num [1:1160] 118 20 101 102 35 30 301 27 66 19 ...
## $ X15: num [1:1160] 4 NA 2 2 NA NA 6 NA 2 NA ...
## $ X16: chr [1:1160] "0 40" "9 7" "1 39" "4 26" ...
## - attr(*, "spec")=
## .. cols(
## .. X1 = col_double(),
## .. X2 = col_character(),
## .. X3 = col_character(),
## .. X4 = col_character(),
## .. X5 = col_character(),
## .. X6 = col_character(),
## .. X7 = col_double(),
## .. X8 = col_character(),
## .. X9 = col_character(),
## .. X10 = col_character(),
## .. X11 = col_double(),
## .. X12 = col_double(),
## .. X13 = col_double(),
## .. X14 = col_double(),
## .. X15 = col_double(),
## .. X16 = col_character()
## .. )
## - attr(*, "problems")=<externalptr>
dta_1 |> as.data.frame() |> head()
## X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11
## 1 1063 Univ.Alaska-Fairbanks AK I 686 560 432 508 914 753 572 677
## 2 1065 Univ.Alaska-Southeast AK IIA 533 494 329 415 716 663 442 559
## 3 11462 Univ.Alaska-Anchorage AK IIA 612 507 414 498 825 681 557 670
## 4 1002 Alabama Agri.&Mech. Univ. AL IIA 442 369 310 350 530 444 376 423
## 5 1004 University of Montevallo AL IIA 441 385 310 388 542 473 383 477
## 6 1008 Athens State College AL IIB 466 394 351 396 558 476 427 478
## X12 X13 X14 X15 X16
## 1 74 125 118 4 0 40
## 2 9 26 20 NA 9 7
## 3 115 124 101 2 1 39
## 4 59 77 102 2 4 26
## 5 57 33 35 NA 2 12
## 6 20 18 30 NA 0 6
1.直接匯入txt發現資料很亂,檢視txt原檔,資料確實一團,嘗試用readr::fwf_empty做資料間的字元區隔
2.依照begin的結果設定字元數間格 $begin [1] 0 6 40 45 49 53 57 61 66 70 74 79 83 87 92 95 $end [1] 5 39 43 48 52 56 60 65 69 73 78 82 86 90 94 NA
3.但做成dta |> as.data.frame()發現X15(94-92+1=3?).X16(NA-95),隨意設一個比較大的值6)之間的資料很亂?看dta_1 talbe發現X15.X16的資料沒有切清楚(X15的資料字元超過3),導致於X15很多NA
4.glimpse()和str() 函式功能類似
fL <- "D:/User/Desktop/datamanagement/20211025/aaup2.txt"
dta_1a <- read.table(fL, sep='\t')
readr::fwf_empty("/User/Desktop/datamanagement/20211025/aaup2.txt")[1:2]
## $begin
## [1] 0 6 40 45 49 53 57 61 66 70 74 79 83 87 92 95
##
## $end
## [1] 5 39 43 48 52 56 60 65 69 73 78 82 86 90 94 NA
dta_1a <- readr::read_fwf("/User/Desktop/datamanagement/20211025/aaup2.txt", skip=1,readr::fwf_cols(X1=6, X2=34, X3=4, X4=4, X5=4, X6=4,X7=4,X8=5,X9=4,X10=4,X11=5,X12=4,X13=4,X14=4,X15=4,X16=5)) #X15改成=4
## Rows: 1160 Columns: 16
## -- Column specification --------------------------------------------------------
##
## chr (8): X2, X3, X4, X5, X6, X8, X9, X10
## dbl (8): X1, X7, X11, X12, X13, X14, X15, X16
##
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
dplyr::glimpse(dta_1a)
## Rows: 1,160
## Columns: 16
## $ X1 <dbl> 1063, 1065, 11462, 1002, 1004, 1008, 1009, 1012, 1016, 1019, 1020,~
## $ X2 <chr> "Univ.Alaska-Fairbanks AK", "Univ.Alaska-Southeast ~
## $ X3 <chr> "I", "IIA", "IIA", "IIA", "IIA", "IIB", "I", "IIB", "IIB", "IIB", ~
## $ X4 <chr> "686", "533", "612", "442", "441", "466", "580", "498", "506", "33~
## $ X5 <chr> "560", "494", "507", "369", "385", "394", "437", "379", "412", "30~
## $ X6 <chr> "432", "329", "414", "310", "310", "351", "374", "322", "359", "28~
## $ X7 <dbl> 508, 415, 498, 350, 388, 396, 455, 401, 411, 301, 386, 300, 291, 2~
## $ X8 <chr> "914", "716", "825", "530", "542", "558", "692", "655", "607", "42~
## $ X9 <chr> "753", "663", "681", "444", "473", "476", "527", "501", "508", "37~
## $ X10 <chr> "572", "442", "557", "376", "383", "427", "451", "404", "445", "34~
## $ X11 <dbl> 677, 559, 670, 423, 477, 478, 546, 523, 503, 366, 493, 363, 363, 3~
## $ X12 <dbl> 74, 9, 115, 59, 57, 20, 366, 34, 67, 8, 106, 27, 17, 18, 83, 23, 1~
## $ X13 <dbl> 125, 26, 124, 77, 33, 18, 354, 25, 40, 15, 42, 25, 19, 28, 46, 17,~
## $ X14 <dbl> 118, 20, 101, 102, 35, 30, 301, 27, 66, 19, 66, 33, 31, 28, 77, 14~
## $ X15 <dbl> 40, 9, 21, 24, 2, 0, 66, 3, 27, 2, 58, 4, 19, 3, 9, 1, 10, 19, 45,~
## $ X16 <dbl> 404, 70, 392, 262, 127, 68, 1109, 89, 200, 44, 272, 89, 86, 77, 21~
dta_1a |> as.data.frame() |> head()
## X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11
## 1 1063 Univ.Alaska-Fairbanks AK I 686 560 432 508 914 753 572 677
## 2 1065 Univ.Alaska-Southeast AK IIA 533 494 329 415 716 663 442 559
## 3 11462 Univ.Alaska-Anchorage AK IIA 612 507 414 498 825 681 557 670
## 4 1002 Alabama Agri.&Mech. Univ. AL IIA 442 369 310 350 530 444 376 423
## 5 1004 University of Montevallo AL IIA 441 385 310 388 542 473 383 477
## 6 1008 Athens State College AL IIB 466 394 351 396 558 476 427 478
## X12 X13 X14 X15 X16
## 1 74 125 118 40 404
## 2 9 26 20 9 70
## 3 115 124 101 21 392
## 4 59 77 102 24 262
## 5 57 33 35 2 127
## 6 20 18 30 0 68
str(dta_1a)
## spec_tbl_df [1,160 x 16] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ X1 : num [1:1160] 1063 1065 11462 1002 1004 ...
## $ X2 : chr [1:1160] "Univ.Alaska-Fairbanks AK" "Univ.Alaska-Southeast AK" "Univ.Alaska-Anchorage AK" "Alabama Agri.&Mech. Univ. AL" ...
## $ X3 : chr [1:1160] "I" "IIA" "IIA" "IIA" ...
## $ X4 : chr [1:1160] "686" "533" "612" "442" ...
## $ X5 : chr [1:1160] "560" "494" "507" "369" ...
## $ X6 : chr [1:1160] "432" "329" "414" "310" ...
## $ X7 : num [1:1160] 508 415 498 350 388 396 455 401 411 301 ...
## $ X8 : chr [1:1160] "914" "716" "825" "530" ...
## $ X9 : chr [1:1160] "753" "663" "681" "444" ...
## $ X10: chr [1:1160] "572" "442" "557" "376" ...
## $ X11: num [1:1160] 677 559 670 423 477 478 546 523 503 366 ...
## $ X12: num [1:1160] 74 9 115 59 57 20 366 34 67 8 ...
## $ X13: num [1:1160] 125 26 124 77 33 18 354 25 40 15 ...
## $ X14: num [1:1160] 118 20 101 102 35 30 301 27 66 19 ...
## $ X15: num [1:1160] 40 9 21 24 2 0 66 3 27 2 ...
## $ X16: num [1:1160] 404 70 392 262 127 ...
## - attr(*, "spec")=
## .. cols(
## .. X1 = col_double(),
## .. X2 = col_character(),
## .. X3 = col_character(),
## .. X4 = col_character(),
## .. X5 = col_character(),
## .. X6 = col_character(),
## .. X7 = col_double(),
## .. X8 = col_character(),
## .. X9 = col_character(),
## .. X10 = col_character(),
## .. X11 = col_double(),
## .. X12 = col_double(),
## .. X13 = col_double(),
## .. X14 = col_double(),
## .. X15 = col_double(),
## .. X16 = col_double()
## .. )
## - attr(*, "problems")=<externalptr>
1.readr::fwf_cols(…X15=4,X16=5),資料就切乾淨了。
匯入資料之後,還是應該看一下dta,以確認正確性。
Here is a copy of the student roster in csv format from NCKU for a course I taught. Dispaly the number of students from each major.
dta_2<-read.csv("D:/User/Desktop/datamanagement/20211025/ncku_roster.csv")
dta_2 |> knitr::kable()
| 座號 | 系.年.班 | 開課系序號 | 學號 | 姓名 | 成績 | 選課時間 |
|---|---|---|---|---|---|---|
| 教師:U3023 許清芳 | 上課時間: 一[6-8];開課號:U3006 U7031 | 科目:資料管理 | NA | |||
| 1 | 心理系 3 | U7031 | D840239 | 蘇 | NA | 02/17/2016 09:17:40 |
| 2 | 心理系 3 | U7031 | D840057 | 吳 | NA | 02/17/2016 09:17:28 |
| 3 | 心理系 4 | U7031 | D841311 | 余 | NA | 02/17/2016 09:09:10 |
| 4 | 心理系 4 | U7031 | D840140 | 王 | NA | 02/17/2016 09:09:34 |
| 5 | 教育所 1 碩 | U3006 | U360098 | 劉 | NA | 01/18/2016 14:56:35 |
| 6 | 教育所 1 博 | U3006 | U380416 | 陳 | NA | 01/25/2016 16:01:08 |
| 7 | 教育所 2 碩 | U3006 | U360311 | 林 | NA | 02/17/2016 11:58:44 |
| 8 | 教育所 2 博 | U3006 | U380020 | 蔡 | NA | 02/17/2016 12:42:06 |
| 9 | 心理所 1 碩 | U7031 | U760464 | 葉 | NA | 02/17/2016 15:46:51 |
| 10 | 心理所 1 碩 | U7031 | U760480 | 王 | NA | 02/17/2016 11:39:53 |
| 11 | 心理所 1 碩 | U7031 | U760420 | 陳 | NA | 02/17/2016 09:19:38 |
| 12 | 心理所 1 碩 | U7031 | U760038 | 吳 | NA | 01/18/2016 13:00:24 |
| 13 | 心理所 1 碩 | U7031 | U760446 | 林 | NA | 01/19/2016 10:48:21 |
| 14 | 心理所 2 碩 | U7031 | U760019 | 胡 | NA | 01/18/2016 12:57:37 |
| 15 | 心理所 2 碩 | U7031 | U760369 | 李 | NA | 02/17/2016 19:13:46 |
dta_2a <- dta_2[-c(1),-c(6)] #remove row1 and column6
dta_2a |> knitr::kable()
| 座號 | 系.年.班 | 開課系序號 | 學號 | 姓名 | 選課時間 | |
|---|---|---|---|---|---|---|
| 2 | 1 | 心理系 3 | U7031 | D840239 | 蘇 | 02/17/2016 09:17:40 |
| 3 | 2 | 心理系 3 | U7031 | D840057 | 吳 | 02/17/2016 09:17:28 |
| 4 | 3 | 心理系 4 | U7031 | D841311 | 余 | 02/17/2016 09:09:10 |
| 5 | 4 | 心理系 4 | U7031 | D840140 | 王 | 02/17/2016 09:09:34 |
| 6 | 5 | 教育所 1 碩 | U3006 | U360098 | 劉 | 01/18/2016 14:56:35 |
| 7 | 6 | 教育所 1 博 | U3006 | U380416 | 陳 | 01/25/2016 16:01:08 |
| 8 | 7 | 教育所 2 碩 | U3006 | U360311 | 林 | 02/17/2016 11:58:44 |
| 9 | 8 | 教育所 2 博 | U3006 | U380020 | 蔡 | 02/17/2016 12:42:06 |
| 10 | 9 | 心理所 1 碩 | U7031 | U760464 | 葉 | 02/17/2016 15:46:51 |
| 11 | 10 | 心理所 1 碩 | U7031 | U760480 | 王 | 02/17/2016 11:39:53 |
| 12 | 11 | 心理所 1 碩 | U7031 | U760420 | 陳 | 02/17/2016 09:19:38 |
| 13 | 12 | 心理所 1 碩 | U7031 | U760038 | 吳 | 01/18/2016 13:00:24 |
| 14 | 13 | 心理所 1 碩 | U7031 | U760446 | 林 | 01/19/2016 10:48:21 |
| 15 | 14 | 心理所 2 碩 | U7031 | U760019 | 胡 | 01/18/2016 12:57:37 |
| 16 | 15 | 心理所 2 碩 | U7031 | U760369 | 李 | 02/17/2016 19:13:46 |
table(dta_2a$系.年.班)
##
## 心理系 3
## 2
## 心理系 4
## 2
## 心理所 1 碩
## 5
## 心理所 2 碩
## 2
## 教育所 1 博
## 1
## 教育所 1 碩
## 1
## 教育所 2 博
## 1
## 教育所 2 碩
## 1
dta_2b<-substr(dta_2a$系.年.班,1,3) #保留dta_2a$系.年.班 第1-3字元存入dta_2b
dta_2b |> knitr::kable()
| x |
|---|
| 心理系 |
| 心理系 |
| 心理系 |
| 心理系 |
| 教育所 |
| 教育所 |
| 教育所 |
| 教育所 |
| 心理所 |
| 心理所 |
| 心理所 |
| 心理所 |
| 心理所 |
| 心理所 |
| 心理所 |
table(dta_2b)
## dta_2b
## 心理系 心理所 教育所
## 4 7 4
dta_2a$系<-substr(dta_2a$系.年.班,1,3) #保留dta_2a$系.年.班 第1-3字元直接再dta_2a做一個column$系
table(dta_2a$系)
##
## 心理系 心理所 教育所
## 4 7 4
1.$系.年.班資料訊息較多,若直接table呈現無法看到全系的資料
2.dta_2b<-substr(dta_2a$系.年.班,1,3))
匯入資料之後,還是應該看一下dta,以確認正確性。
Data on body temperature, gender, and heart rate. are taken from Mackowiak et al. (1992). “A Critical Appraisal of 98.6 Degrees F …,” in the Journal of the American Medical Association (268), 1578-80. Import the file. Find the correlation between body temperature and heart rate and investigate if there is a gender difference in mean temperature.
pacman::p_load(readxl, httr)
pacman::p_load(readxl, httr)
dta_3 <-read_xls( "D:/User/Desktop/datamanagement/20211025/normtemp.xls", sheet=1)
str(dta_3)
## tibble [130 x 3] (S3: tbl_df/tbl/data.frame)
## $ Temp : num [1:130] 96.3 96.7 96.9 97 97.1 97.1 97.1 97.2 97.3 97.4 ...
## $ Sex : num [1:130] 1 1 1 1 1 1 1 1 1 1 ...
## $ Beats: num [1:130] 70 71 74 80 73 75 82 64 69 70 ...
summary(dta_3)
## Temp Sex Beats
## Min. : 96.30 Min. :1.0 Min. :57.00
## 1st Qu.: 97.80 1st Qu.:1.0 1st Qu.:69.00
## Median : 98.30 Median :1.5 Median :74.00
## Mean : 98.25 Mean :1.5 Mean :73.76
## 3rd Qu.: 98.70 3rd Qu.:2.0 3rd Qu.:79.00
## Max. :100.80 Max. :2.0 Max. :89.00
dta_3$Sex_F<-as.factor(dta_3$Sex)
str(dta_3)
## tibble [130 x 4] (S3: tbl_df/tbl/data.frame)
## $ Temp : num [1:130] 96.3 96.7 96.9 97 97.1 97.1 97.1 97.2 97.3 97.4 ...
## $ Sex : num [1:130] 1 1 1 1 1 1 1 1 1 1 ...
## $ Beats: num [1:130] 70 71 74 80 73 75 82 64 69 70 ...
## $ Sex_F: Factor w/ 2 levels "1","2": 1 1 1 1 1 1 1 1 1 1 ...
class(dta_3)
## [1] "tbl_df" "tbl" "data.frame"
畫性別分組的盒形圖
library(GGally)
## 載入需要的套件:ggplot2
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
qplot(x=Sex_F,
y=Beats,
data=dta_3,
geom="boxplot", # 圖形=boxplot
xlab="Gender",
ylab="Beats",
main = "Box Plot of Beats by Gender",
color= Sex_F # 以顏色標註性別,複合式的盒形圖
)
# boxplot心得
1.從盒形圖看起來,Gender 1和Gender 2的心跳數感覺上差不多
2.要看Gender 1和Gender 2的心跳數有沒有差異,還是要做t test
attach(dta_3)
dta_3b=dta_3[Sex_F=='1'|Sex_F=='2',] #指定分組
var.test(Beats~Sex_F, dta_3b) #同質性檢定
##
## F test to compare two variances
##
## data: Beats by Sex_F
## F = 0.52543, num df = 64, denom df = 64, p-value = 0.011
## alternative hypothesis: true ratio of variances is not equal to 1
## 95 percent confidence interval:
## 0.3204830 0.8614301
## sample estimates:
## ratio of variances
## 0.5254272
t.test(Beats~Sex_F, var.equal=T, dta_3b) #獨立樣本t檢定
##
## Two Sample t-test
##
## data: Beats by Sex_F
## t = -0.63191, df = 128, p-value = 0.5286
## alternative hypothesis: true difference in means between group 1 and group 2 is not equal to 0
## 95 percent confidence interval:
## -3.241461 1.672230
## sample estimates:
## mean in group 1 mean in group 2
## 73.36923 74.15385
1.將性別分組後做同質性檢定後,兩組資料為同質(F = 0.52543, num df = 64, denom df = 64, p-value = 0.011)
2.做t-test,結果發現性別與心跳並無顯著差異(t = -0.63191, df = 128, p-value = 0.5286)
install.packages(“GGally”)
dta_3a <- dta_3[,-c(2)] #移除Column Sex
library(GGally)
ggpairs(dta_3a)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# ggpairs心得 > 1.從複合圖看到,Beats跟Temp有正相關(Corr:0.254**)
ggplot(data = dta_3, aes(x = Temp, y = Beats)) +
geom_point() +
geom_smooth(method = "lm") #不考慮性別,看Temp和Beats的關係
## `geom_smooth()` using formula 'y ~ x'
require(ggplot2)
qplot(x=Temp,
y=Beats,
data=dta_3,
geom="point", # 圖形=scatter plot
main = "Scatter Plot of Temp-Beats",
xlab="Temp",
ylab="Beats",
color= Sex_F # 以顏色標註月份,複合式的散布圖
)
ggplot(data = dta_3, aes(x = Temp, y = Beats,
color = Sex_F, shape = Sex_F)) +
geom_jitter(alpha = 1/2, size = 3) +
geom_smooth(method = "lm")
## `geom_smooth()` using formula 'y ~ x'
1.從ggplot看到,Beats跟Temp有正相關
2.若以性別分組,Beats跟Temp在不同組別仍有正相關,但在Sex_F=1的組別,正相關較Sex_F=2的弱,兩組分別有沒有達顯著,還是要做檢定。
library(MASS)
fullmodel=lm(Beats~Temp+factor(Sex_F), data=dta_3)
summary(fullmodel)
##
## Call:
## lm(formula = Beats ~ Temp + factor(Sex_F), data = dta_3)
##
## Residuals:
## Min 1Q Median 3Q Max
## -16.6824 -4.6553 0.3581 4.7838 15.8041
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -165.24470 82.74978 -1.997 0.04797 *
## Temp 2.43224 0.84344 2.884 0.00462 **
## factor(Sex_F)2 0.08114 1.23203 0.066 0.94760
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.885 on 127 degrees of freedom
## Multiple R-squared: 0.06437, Adjusted R-squared: 0.04964
## F-statistic: 4.369 on 2 and 127 DF, p-value: 0.01462
1.regression model可以看到 full model為顯著
2.Beats=-165.2447+2.43224x(Temp)+e,性別並無影響
3.體溫每上升一度,會增加2.43的心跳速率
4.這邊想要以Sex_F分組做linear regression,嘗試很久做不出來…
A classmate of yours used data.entry() to change the first woman’s height to 50 in the women{datasets}. She then closed the editor and issued plot(women). To her surprise, she got this message: Error in xy.coords(x, y, xlabel, ylabel, log) : ‘x’ is a list, but does not have components ‘x’ and ‘y’
Explain what had happened. How would you plot the edited data file?
library(datasets)
dta_4<-women
str(dta_4)
## 'data.frame': 15 obs. of 2 variables:
## $ height: num 58 59 60 61 62 63 64 65 66 67 ...
## $ weight: num 115 117 120 123 126 129 132 135 139 142 ...
class(dta_4)
## [1] "data.frame"
head(dta_4)
## height weight
## 1 58 115
## 2 59 117
## 3 60 120
## 4 61 123
## 5 62 126
## 6 63 129
plot(dta_4)
data.entry(dta_4)
str(dta_4)
## List of 2
## $ height: num [1:15] 58 59 60 61 62 63 64 65 66 67 ...
## $ weight: num [1:15] 115 117 120 123 126 129 132 135 139 142 ...
class(dta_4)
## [1] "list"
#plot(dta_4) with error message
knitr::include_graphics("x is a list.png")
plot(dta_4$height, dta_4$weight)
1.使用data.entry()改資料,也會改變資料型態,從data.frame變成list
2.plot無法辨識list的資料,因此要告訴R,對應的資料是甚麼名字
The Ministry of Interior of Taiwan provides many datasets on its website. Download the excel file of Table 8. Couples of Marriages, Divorces, Crude Marriage Rate and Crude Divorce Rate to examine the trend of the crude divorce rate over the years.
fL<-"https://www.ris.gov.tw/documents/data/en/3/History-Table-8-2020.xls"
GET(fL, write_disk(tf <- tempfile(fileext=".xls")))
## Response [https://www.ris.gov.tw/documents/data/en/3/History-Table-8-2020.xls]
## Date: 2021-10-31 15:21
## Status: 200
## Content-Type: application/vnd.ms-excel
## Size: 34.3 kB
## <ON DISK> C:\Users\User\AppData\Local\Temp\RtmpEvwXeK\file1e842ab07a71.xls
dta_5 <- read_excel(tf, sheet=1)
## New names:
## * `` -> ...2
## * `` -> ...3
## * `` -> ...4
## * `` -> ...5
dplyr::glimpse(dta_5)
## Rows: 55
## Columns: 5
## $ `Table 8. Couples of Marriages, Divorces, Crude Marriage Rate and Crude Divorce Rate` <chr> ~
## $ ...2 <chr> ~
## $ ...3 <chr> ~
## $ ...4 <chr> ~
## $ ...5 <chr> ~
dta_5a <- dta_5[-c(1, 2, 3, 51:56),] #remove row1-3 and column51-56
str(dta_5a)
## tibble [47 x 5] (S3: tbl_df/tbl/data.frame)
## $ Table 8. Couples of Marriages, Divorces, Crude Marriage Rate and Crude Divorce Rate: chr [1:47] "1975" "1976" "1977" "1978" ...
## $ ...2 : chr [1:47] "149958" "152240" "156616" "164833" ...
## $ ...3 : chr [1:47] "9.3300000000000001" "9.2799999999999994" "9.3599999999999994" "9.6699999999999999" ...
## $ ...4 : chr [1:47] "7387" "8155" "9259" "10782" ...
## $ ...5 : chr [1:47] "0.46000000000000002" "0.5" "0.56000000000000005" "0.64000000000000001" ...
dplyr::glimpse(dta_5a)
## Rows: 47
## Columns: 5
## $ `Table 8. Couples of Marriages, Divorces, Crude Marriage Rate and Crude Divorce Rate` <chr> ~
## $ ...2 <chr> ~
## $ ...3 <chr> ~
## $ ...4 <chr> ~
## $ ...5 <chr> ~
dta_5a$year<-as.numeric(dta_5a$`Table 8. Couples of Marriages, Divorces, Crude Marriage Rate and Crude Divorce Rate`) #定義新的column year
## Warning: 強制變更過程中產生了 NA
dta_5a$marrage<-as.numeric(dta_5a$...2)#定義新的column marrage
dta_5a$marragerate<-as.numeric(dta_5a$...3)#定義新的column marragerate
dta_5a$Divorce<-as.numeric(dta_5a$...4)#定義新的column Divorce
dta_5a$Divorcerate<-as.numeric(dta_5a$...5)#定義新的column Divorcerate
dta_5b <- dta_5a[,-c(1:5)] #刪除原來的column 1-5
str(dta_5b)
## tibble [47 x 5] (S3: tbl_df/tbl/data.frame)
## $ year : num [1:47] 1975 1976 1977 1978 1979 ...
## $ marrage : num [1:47] 149958 152240 156616 164833 155941 ...
## $ marragerate: num [1:47] 9.33 9.28 9.36 9.67 8.98 9.67 9.55 8.74 8.59 8.08 ...
## $ Divorce : num [1:47] 7387 8155 9259 10782 12668 ...
## $ Divorcerate: num [1:47] 0.46 0.5 0.56 0.64 0.73 0.77 0.83 0.93 0.95 1.01 ...
tail(dta_5b)
## # A tibble: 6 x 5
## year marrage marragerate Divorce Divorcerate
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 2016 148349 6.31 53850 2.29
## 2 2017 137620 5.84 54439 2.31
## 3 2018 135403 5.74 54443 2.31
## 4 2019 133741 5.67 54346 2.3
## 5 2020 120397 5.11 51238 2.19
## 6 NA NA NA NA NA
library(GGally)
P1<-qplot(x=year,
y=Divorcerate,
data=dta_5b,
geom="point", # 圖形=ggplot
xlab="year",
ylab="Divorcerate",
main = "ggplot of divorce rate by year",
)
P1
## Warning: Removed 1 rows containing missing values (geom_point).
P2<-P1 + scale_x_continuous(breaks = seq(from = 1975, to = 2020, by = 10)) # X軸刻度縮小
P2
## Warning: Removed 1 rows containing missing values (geom_point).
- 定義year為as.numeric,才有辦法在X軸上做間隔的定義