Inclass_1

The AAUP2 data set is a comma-delimited fixed column format text file with ’*’ for missing value. Import the file into R and indicate missing values by ‘NA’. Hint: ?read.csv

# Location of the AAUP2 fixed-width text file.
fL <- "D:/User/Desktop/datamanagement/20211025/aaup2.txt"
# Exploratory read as a tab-separated table, kept only to look at the raw
# layout; dta_1 is overwritten by the fixed-width import further down.
dta_1 <- read.table(fL, header = TRUE, sep = "\t")
# Ask readr for the column boundaries implied by the blank columns.  The
# same `fL` is reused so both reads are guaranteed to hit the same file.
readr::fwf_empty(fL)[1:2]
## $begin
##  [1]  0  6 40 45 49 53 57 61 66 70 74 79 83 87 92 95
## 
## $end
##  [1]  5 39 43 48 52 56 60 65 69 73 78 82 86 90 94 NA
# Importing the txt directly gave a messy table, and the raw file itself is
# one jumbled block, so try the boundaries reported by readr::fwf_empty.

# First attempt at the column widths, skipping the first line of the file.
dta_1 <- readr::read_fwf(
  file = "/User/Desktop/datamanagement/20211025/aaup2.txt",
  col_positions = readr::fwf_cols(
    X1 = 6, X2 = 34, X3 = 4, X4 = 4, X5 = 4, X6 = 4, X7 = 4, X8 = 5,
    X9 = 4, X10 = 4, X11 = 5, X12 = 4, X13 = 4, X14 = 4, X15 = 3, X16 = 5
  ),
  skip = 1
)
## Rows: 1160 Columns: 16
## -- Column specification --------------------------------------------------------
## 
## chr (9): X2, X3, X4, X5, X6, X8, X9, X10, X16
## dbl (7): X1, X7, X11, X12, X13, X14, X15
## 
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
dplyr::glimpse(dta_1)  # glimpse() is similar to base R's str()
## Rows: 1,160
## Columns: 16
## $ X1  <dbl> 1063, 1065, 11462, 1002, 1004, 1008, 1009, 1012, 1016, 1019, 1020,~
## $ X2  <chr> "Univ.Alaska-Fairbanks          AK", "Univ.Alaska-Southeast       ~
## $ X3  <chr> "I", "IIA", "IIA", "IIA", "IIA", "IIB", "I", "IIB", "IIB", "IIB", ~
## $ X4  <chr> "686", "533", "612", "442", "441", "466", "580", "498", "506", "33~
## $ X5  <chr> "560", "494", "507", "369", "385", "394", "437", "379", "412", "30~
## $ X6  <chr> "432", "329", "414", "310", "310", "351", "374", "322", "359", "28~
## $ X7  <dbl> 508, 415, 498, 350, 388, 396, 455, 401, 411, 301, 386, 300, 291, 2~
## $ X8  <chr> "914", "716", "825", "530", "542", "558", "692", "655", "607", "42~
## $ X9  <chr> "753", "663", "681", "444", "473", "476", "527", "501", "508", "37~
## $ X10 <chr> "572", "442", "557", "376", "383", "427", "451", "404", "445", "34~
## $ X11 <dbl> 677, 559, 670, 423, 477, 478, 546, 523, 503, 366, 493, 363, 363, 3~
## $ X12 <dbl> 74, 9, 115, 59, 57, 20, 366, 34, 67, 8, 106, 27, 17, 18, 83, 23, 1~
## $ X13 <dbl> 125, 26, 124, 77, 33, 18, 354, 25, 40, 15, 42, 25, 19, 28, 46, 17,~
## $ X14 <dbl> 118, 20, 101, 102, 35, 30, 301, 27, 66, 19, 66, 33, 31, 28, 77, 14~
## $ X15 <dbl> 4, NA, 2, 2, NA, NA, 6, NA, 2, NA, 5, NA, 1, NA, NA, NA, 1, 1, 4, ~
## $ X16 <chr> "0  40", "9   7", "1  39", "4  26", "2  12", "0   6", "6 110", "3 ~
str(dta_1)
## spec_tbl_df [1,160 x 16] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ X1 : num [1:1160] 1063 1065 11462 1002 1004 ...
##  $ X2 : chr [1:1160] "Univ.Alaska-Fairbanks          AK" "Univ.Alaska-Southeast          AK" "Univ.Alaska-Anchorage          AK" "Alabama Agri.&Mech. Univ.      AL" ...
##  $ X3 : chr [1:1160] "I" "IIA" "IIA" "IIA" ...
##  $ X4 : chr [1:1160] "686" "533" "612" "442" ...
##  $ X5 : chr [1:1160] "560" "494" "507" "369" ...
##  $ X6 : chr [1:1160] "432" "329" "414" "310" ...
##  $ X7 : num [1:1160] 508 415 498 350 388 396 455 401 411 301 ...
##  $ X8 : chr [1:1160] "914" "716" "825" "530" ...
##  $ X9 : chr [1:1160] "753" "663" "681" "444" ...
##  $ X10: chr [1:1160] "572" "442" "557" "376" ...
##  $ X11: num [1:1160] 677 559 670 423 477 478 546 523 503 366 ...
##  $ X12: num [1:1160] 74 9 115 59 57 20 366 34 67 8 ...
##  $ X13: num [1:1160] 125 26 124 77 33 18 354 25 40 15 ...
##  $ X14: num [1:1160] 118 20 101 102 35 30 301 27 66 19 ...
##  $ X15: num [1:1160] 4 NA 2 2 NA NA 6 NA 2 NA ...
##  $ X16: chr [1:1160] "0  40" "9   7" "1  39" "4  26" ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   X1 = col_double(),
##   ..   X2 = col_character(),
##   ..   X3 = col_character(),
##   ..   X4 = col_character(),
##   ..   X5 = col_character(),
##   ..   X6 = col_character(),
##   ..   X7 = col_double(),
##   ..   X8 = col_character(),
##   ..   X9 = col_character(),
##   ..   X10 = col_character(),
##   ..   X11 = col_double(),
##   ..   X12 = col_double(),
##   ..   X13 = col_double(),
##   ..   X14 = col_double(),
##   ..   X15 = col_double(),
##   ..   X16 = col_character()
##   .. )
##  - attr(*, "problems")=<externalptr>
# Show the first rows as a plain data.frame so every column is printed.
head(as.data.frame(dta_1))
##      X1                                X2  X3  X4  X5  X6  X7  X8  X9 X10 X11
## 1  1063 Univ.Alaska-Fairbanks          AK   I 686 560 432 508 914 753 572 677
## 2  1065 Univ.Alaska-Southeast          AK IIA 533 494 329 415 716 663 442 559
## 3 11462 Univ.Alaska-Anchorage          AK IIA 612 507 414 498 825 681 557 670
## 4  1002 Alabama Agri.&Mech. Univ.      AL IIA 442 369 310 350 530 444 376 423
## 5  1004 University of Montevallo       AL IIA 441 385 310 388 542 473 383 477
## 6  1008 Athens State College           AL IIB 466 394 351 396 558 476 427 478
##   X12 X13 X14 X15   X16
## 1  74 125 118   4 0  40
## 2   9  26  20  NA 9   7
## 3 115 124 101   2 1  39
## 4  59  77 102   2 4  26
## 5  57  33  35  NA 2  12
## 6  20  18  30  NA 0   6

心得:

1.直接匯入txt發現資料很亂,檢視txt原檔,資料確實一團,嘗試用readr::fwf_empty做資料間的字元區隔

2.依照 $begin 與 $end 的結果設定各欄位的字元數間隔:$begin [1] 0 6 40 45 49 53 57 61 66 70 74 79 83 87 92 95;$end [1] 5 39 43 48 52 56 60 65 69 73 78 82 86 90 94 NA

3.但用 dta_1 |> as.data.frame() 檢視後,發現 X15(寬度 94-92+1=3)與 X16($end 為 NA,所以隨意設一個較大的值 5)之間的資料很亂。查看 dta_1 table 後發現 X15、X16 的資料沒有切清楚(X15 的資料字元數超過 3),導致 X15 出現很多 NA。

4.glimpse()和str() 函式功能類似

# Location of the AAUP2 fixed-width text file.
fL <- "D:/User/Desktop/datamanagement/20211025/aaup2.txt"
# Exploratory tab-separated read; dta_1a is overwritten by the fixed-width
# import further down.
dta_1a <- read.table(fL, sep = "\t")
# Column boundaries guessed by readr.  The same `fL` is reused instead of a
# second, drive-relative spelling of the path.
readr::fwf_empty(fL)[1:2]
## $begin
##  [1]  0  6 40 45 49 53 57 61 66 70 74 79 83 87 92 95
## 
## $end
##  [1]  5 39 43 48 52 56 60 65 69 73 78 82 86 90 94 NA
# Second attempt: X15 is widened from 3 to 4 characters so that X15 and X16
# split cleanly.
# The exercise states that "*" marks a missing value, so "*" is added to
# `na`; presumably the "*" entries are why X4-X6 and X8-X10 were parsed as
# character before.
# NOTE(review): re-knit after this change - the column types printed below
# were recorded before "*" was treated as NA.
dta_1a <- readr::read_fwf(
  file = fL,
  col_positions = readr::fwf_cols(
    X1 = 6, X2 = 34, X3 = 4, X4 = 4, X5 = 4, X6 = 4, X7 = 4, X8 = 5,
    X9 = 4, X10 = 4, X11 = 5, X12 = 4, X13 = 4, X14 = 4, X15 = 4, X16 = 5
  ),
  na = c("", "NA", "*"),
  skip = 1
)
## Rows: 1160 Columns: 16
## -- Column specification --------------------------------------------------------
## 
## chr (8): X2, X3, X4, X5, X6, X8, X9, X10
## dbl (8): X1, X7, X11, X12, X13, X14, X15, X16
## 
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
dplyr::glimpse(dta_1a)
## Rows: 1,160
## Columns: 16
## $ X1  <dbl> 1063, 1065, 11462, 1002, 1004, 1008, 1009, 1012, 1016, 1019, 1020,~
## $ X2  <chr> "Univ.Alaska-Fairbanks          AK", "Univ.Alaska-Southeast       ~
## $ X3  <chr> "I", "IIA", "IIA", "IIA", "IIA", "IIB", "I", "IIB", "IIB", "IIB", ~
## $ X4  <chr> "686", "533", "612", "442", "441", "466", "580", "498", "506", "33~
## $ X5  <chr> "560", "494", "507", "369", "385", "394", "437", "379", "412", "30~
## $ X6  <chr> "432", "329", "414", "310", "310", "351", "374", "322", "359", "28~
## $ X7  <dbl> 508, 415, 498, 350, 388, 396, 455, 401, 411, 301, 386, 300, 291, 2~
## $ X8  <chr> "914", "716", "825", "530", "542", "558", "692", "655", "607", "42~
## $ X9  <chr> "753", "663", "681", "444", "473", "476", "527", "501", "508", "37~
## $ X10 <chr> "572", "442", "557", "376", "383", "427", "451", "404", "445", "34~
## $ X11 <dbl> 677, 559, 670, 423, 477, 478, 546, 523, 503, 366, 493, 363, 363, 3~
## $ X12 <dbl> 74, 9, 115, 59, 57, 20, 366, 34, 67, 8, 106, 27, 17, 18, 83, 23, 1~
## $ X13 <dbl> 125, 26, 124, 77, 33, 18, 354, 25, 40, 15, 42, 25, 19, 28, 46, 17,~
## $ X14 <dbl> 118, 20, 101, 102, 35, 30, 301, 27, 66, 19, 66, 33, 31, 28, 77, 14~
## $ X15 <dbl> 40, 9, 21, 24, 2, 0, 66, 3, 27, 2, 58, 4, 19, 3, 9, 1, 10, 19, 45,~
## $ X16 <dbl> 404, 70, 392, 262, 127, 68, 1109, 89, 200, 44, 272, 89, 86, 77, 21~
# Show the first rows as a plain data.frame so every column is printed.
head(as.data.frame(dta_1a))
##      X1                                X2  X3  X4  X5  X6  X7  X8  X9 X10 X11
## 1  1063 Univ.Alaska-Fairbanks          AK   I 686 560 432 508 914 753 572 677
## 2  1065 Univ.Alaska-Southeast          AK IIA 533 494 329 415 716 663 442 559
## 3 11462 Univ.Alaska-Anchorage          AK IIA 612 507 414 498 825 681 557 670
## 4  1002 Alabama Agri.&Mech. Univ.      AL IIA 442 369 310 350 530 444 376 423
## 5  1004 University of Montevallo       AL IIA 441 385 310 388 542 473 383 477
## 6  1008 Athens State College           AL IIB 466 394 351 396 558 476 427 478
##   X12 X13 X14 X15 X16
## 1  74 125 118  40 404
## 2   9  26  20   9  70
## 3 115 124 101  21 392
## 4  59  77 102  24 262
## 5  57  33  35   2 127
## 6  20  18  30   0  68
str(dta_1a)
## spec_tbl_df [1,160 x 16] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ X1 : num [1:1160] 1063 1065 11462 1002 1004 ...
##  $ X2 : chr [1:1160] "Univ.Alaska-Fairbanks          AK" "Univ.Alaska-Southeast          AK" "Univ.Alaska-Anchorage          AK" "Alabama Agri.&Mech. Univ.      AL" ...
##  $ X3 : chr [1:1160] "I" "IIA" "IIA" "IIA" ...
##  $ X4 : chr [1:1160] "686" "533" "612" "442" ...
##  $ X5 : chr [1:1160] "560" "494" "507" "369" ...
##  $ X6 : chr [1:1160] "432" "329" "414" "310" ...
##  $ X7 : num [1:1160] 508 415 498 350 388 396 455 401 411 301 ...
##  $ X8 : chr [1:1160] "914" "716" "825" "530" ...
##  $ X9 : chr [1:1160] "753" "663" "681" "444" ...
##  $ X10: chr [1:1160] "572" "442" "557" "376" ...
##  $ X11: num [1:1160] 677 559 670 423 477 478 546 523 503 366 ...
##  $ X12: num [1:1160] 74 9 115 59 57 20 366 34 67 8 ...
##  $ X13: num [1:1160] 125 26 124 77 33 18 354 25 40 15 ...
##  $ X14: num [1:1160] 118 20 101 102 35 30 301 27 66 19 ...
##  $ X15: num [1:1160] 40 9 21 24 2 0 66 3 27 2 ...
##  $ X16: num [1:1160] 404 70 392 262 127 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   X1 = col_double(),
##   ..   X2 = col_character(),
##   ..   X3 = col_character(),
##   ..   X4 = col_character(),
##   ..   X5 = col_character(),
##   ..   X6 = col_character(),
##   ..   X7 = col_double(),
##   ..   X8 = col_character(),
##   ..   X9 = col_character(),
##   ..   X10 = col_character(),
##   ..   X11 = col_double(),
##   ..   X12 = col_double(),
##   ..   X13 = col_double(),
##   ..   X14 = col_double(),
##   ..   X15 = col_double(),
##   ..   X16 = col_double()
##   .. )
##  - attr(*, "problems")=<externalptr>

心得:

1.readr::fwf_cols(…X15=4,X16=5),資料就切乾淨了。

匯入資料之後,還是應該看一下dta,以確認正確性。

Inclass_2

Here is a copy of the student roster in csv format from NCKU for a course I taught. Display the number of students from each major.

# Read the NCKU course roster and render it as a table.
dta_2 <- read.csv(
  file = "D:/User/Desktop/datamanagement/20211025/ncku_roster.csv"
)
knitr::kable(dta_2)
座號 系.年.班 開課系序號 學號 姓名 成績 選課時間
教師:U3023 許清芳 上課時間: 一[6-8];開課號:U3006 U7031 科目:資料管理 NA
1 心理系 3 U7031 D840239 NA 02/17/2016 09:17:40
2 心理系 3 U7031 D840057 NA 02/17/2016 09:17:28
3 心理系 4 U7031 D841311 NA 02/17/2016 09:09:10
4 心理系 4 U7031 D840140 NA 02/17/2016 09:09:34
5 教育所 1 碩 U3006 U360098 NA 01/18/2016 14:56:35
6 教育所 1 博 U3006 U380416 NA 01/25/2016 16:01:08
7 教育所 2 碩 U3006 U360311 NA 02/17/2016 11:58:44
8 教育所 2 博 U3006 U380020 NA 02/17/2016 12:42:06
9 心理所 1 碩 U7031 U760464 NA 02/17/2016 15:46:51
10 心理所 1 碩 U7031 U760480 NA 02/17/2016 11:39:53
11 心理所 1 碩 U7031 U760420 NA 02/17/2016 09:19:38
12 心理所 1 碩 U7031 U760038 NA 01/18/2016 13:00:24
13 心理所 1 碩 U7031 U760446 NA 01/19/2016 10:48:21
14 心理所 2 碩 U7031 U760019 NA 01/18/2016 12:57:37
15 心理所 2 碩 U7031 U760369 NA 02/17/2016 19:13:46
# Drop the first row (the teacher/course line in the printed roster) and
# the sixth column, then render the result.
dta_2a <- dta_2[-1, -6]
knitr::kable(dta_2a)
座號 系.年.班 開課系序號 學號 姓名 選課時間
2 1 心理系 3 U7031 D840239 02/17/2016 09:17:40
3 2 心理系 3 U7031 D840057 02/17/2016 09:17:28
4 3 心理系 4 U7031 D841311 02/17/2016 09:09:10
5 4 心理系 4 U7031 D840140 02/17/2016 09:09:34
6 5 教育所 1 碩 U3006 U360098 01/18/2016 14:56:35
7 6 教育所 1 博 U3006 U380416 01/25/2016 16:01:08
8 7 教育所 2 碩 U3006 U360311 02/17/2016 11:58:44
9 8 教育所 2 博 U3006 U380020 02/17/2016 12:42:06
10 9 心理所 1 碩 U7031 U760464 02/17/2016 15:46:51
11 10 心理所 1 碩 U7031 U760480 02/17/2016 11:39:53
12 11 心理所 1 碩 U7031 U760420 02/17/2016 09:19:38
13 12 心理所 1 碩 U7031 U760038 01/18/2016 13:00:24
14 13 心理所 1 碩 U7031 U760446 01/19/2016 10:48:21
15 14 心理所 2 碩 U7031 U760019 01/18/2016 12:57:37
16 15 心理所 2 碩 U7031 U760369 02/17/2016 19:13:46
table(dta_2a$系.年.班)
## 
## 心理系           3                                
##                                                 2 
## 心理系           4                                
##                                                 2 
## 心理所           1 碩                             
##                                                 5 
## 心理所           2 碩                             
##                                                 2 
## 教育所           1 博                             
##                                                 1 
## 教育所           1 碩                             
##                                                 1 
## 教育所           2 博                             
##                                                 1 
## 教育所           2 碩                             
##                                                 1
# Keep only characters 1-3 of the 系.年.班 column and store them in dta_2b.
dta_2b <- substring(dta_2a$系.年.班, first = 1, last = 3)
knitr::kable(dta_2b)
x
心理系
心理系
心理系
心理系
教育所
教育所
教育所
教育所
心理所
心理所
心理所
心理所
心理所
心理所
心理所
table(dta_2b)
## dta_2b
## 心理系 心理所 教育所 
##      4      7      4
# Same three-character prefix, this time stored as a new column 系 inside
# dta_2a, followed by the head count per value.
dta_2a[["系"]] <- substring(dta_2a[["系.年.班"]], first = 1, last = 3)
table(dta_2a[["系"]])
## 
## 心理系 心理所 教育所 
##      4      7      4

心得:

1.$系.年.班資料訊息較多,若直接table呈現無法看到全系的資料

2.用 dta_2b <- substr(dta_2a$系.年.班, 1, 3) 只保留前三個字元(系所名稱),再用 table() 就能得到各系所的人數

匯入資料之後,還是應該看一下dta,以確認正確性。

Inclass_3

Data on body temperature, gender, and heart rate are taken from Mackowiak et al. (1992), “A Critical Appraisal of 98.6 Degrees F …,” in the Journal of the American Medical Association (268), 1578-80. Import the file. Find the correlation between body temperature and heart rate and investigate if there is a gender difference in mean temperature.

# Attach readxl and httr once; the second, identical p_load() call was
# redundant.
pacman::p_load(readxl, httr)

# Import the first sheet of the body-temperature workbook and inspect it.
dta_3 <- read_xls(
  "D:/User/Desktop/datamanagement/20211025/normtemp.xls",
  sheet = 1
)
str(dta_3)
## tibble [130 x 3] (S3: tbl_df/tbl/data.frame)
##  $ Temp : num [1:130] 96.3 96.7 96.9 97 97.1 97.1 97.1 97.2 97.3 97.4 ...
##  $ Sex  : num [1:130] 1 1 1 1 1 1 1 1 1 1 ...
##  $ Beats: num [1:130] 70 71 74 80 73 75 82 64 69 70 ...
summary(dta_3)
##       Temp             Sex          Beats      
##  Min.   : 96.30   Min.   :1.0   Min.   :57.00  
##  1st Qu.: 97.80   1st Qu.:1.0   1st Qu.:69.00  
##  Median : 98.30   Median :1.5   Median :74.00  
##  Mean   : 98.25   Mean   :1.5   Mean   :73.76  
##  3rd Qu.: 98.70   3rd Qu.:2.0   3rd Qu.:79.00  
##  Max.   :100.80   Max.   :2.0   Max.   :89.00
# Add Sex_F, a factor copy of the numeric Sex code, and re-inspect.
dta_3[["Sex_F"]] <- factor(dta_3[["Sex"]])
str(dta_3)
## tibble [130 x 4] (S3: tbl_df/tbl/data.frame)
##  $ Temp : num [1:130] 96.3 96.7 96.9 97 97.1 97.1 97.1 97.2 97.3 97.4 ...
##  $ Sex  : num [1:130] 1 1 1 1 1 1 1 1 1 1 ...
##  $ Beats: num [1:130] 70 71 74 80 73 75 82 64 69 70 ...
##  $ Sex_F: Factor w/ 2 levels "1","2": 1 1 1 1 1 1 1 1 1 1 ...
class(dta_3)
## [1] "tbl_df"     "tbl"        "data.frame"

boxplot

畫性別分組的盒形圖

library(GGally)
## 載入需要的套件:ggplot2
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
# Box plot of heart rate (Beats) by gender, one colour per gender.
# qplot() is deprecated in current ggplot2, so the same plot is spelled out
# with ggplot() and identical labels.
ggplot(data = dta_3, aes(x = Sex_F, y = Beats, colour = Sex_F)) +
  geom_boxplot() +
  labs(
    x = "Gender",
    y = "Beats",
    title = "Box Plot of Beats by Gender"
  )

# boxplot心得

1.從盒形圖看起來,Gender 1和Gender 2的心跳數感覺上差不多

2.要看Gender 1和Gender 2的心跳數有沒有差異,還是要做t test

t-test by gender

# Keep the rows whose gender code is 1 or 2.  Sex_F is taken from dta_3
# explicitly, so attach() (and the global name clutter it causes) is not
# needed.
dta_3b <- dta_3[dta_3$Sex_F %in% c("1", "2"), ]
# F test for equality of the two group variances.
var.test(Beats ~ Sex_F, data = dta_3b)
## 
##  F test to compare two variances
## 
## data:  Beats by Sex_F
## F = 0.52543, num df = 64, denom df = 64, p-value = 0.011
## alternative hypothesis: true ratio of variances is not equal to 1
## 95 percent confidence interval:
##  0.3204830 0.8614301
## sample estimates:
## ratio of variances 
##          0.5254272
# Independent two-sample t-test (pooled variance) of Beats by gender.
# NOTE(review): the F test above reported p = 0.011, i.e. unequal variances;
# consider var.equal = FALSE (Welch) here.
# NOTE(review): the exercise asks about mean temperature, while this call
# compares Beats - confirm which variable is intended.
t.test(Beats ~ Sex_F, data = dta_3b, var.equal = TRUE)
## 
##  Two Sample t-test
## 
## data:  Beats by Sex_F
## t = -0.63191, df = 128, p-value = 0.5286
## alternative hypothesis: true difference in means between group 1 and group 2 is not equal to 0
## 95 percent confidence interval:
##  -3.241461  1.672230
## sample estimates:
## mean in group 1 mean in group 2 
##        73.36923        74.15385

t-test by gender心得

1.將性別分組後做變異數同質性檢定,結果達顯著(F = 0.52543, num df = 64, denom df = 64, p-value = 0.011 < .05),表示兩組的變異數並不同質;因此 t 檢定宜改用不假設變異數相等的 Welch 版本(var.equal = FALSE)

2.做t-test,結果發現性別與心跳並無顯著差異(t = -0.63191, df = 128, p-value = 0.5286)

ggpairs

install.packages("GGally")

# Drop column 2 (Sex) and draw the pairs plot of what remains.
dta_3a <- dta_3[, -2]
library(GGally)
ggpairs(data = dta_3a)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggpairs心得

1.從複合圖看到,Beats跟Temp有正相關(Corr:0.254**)

ggplot

# Temp against Beats with gender ignored, plus a fitted straight line.
ggplot(dta_3, aes(Temp, Beats)) +
  geom_point() +
  geom_smooth(method = "lm")
## `geom_smooth()` using formula 'y ~ x'

# library() stops with an error if ggplot2 is missing; require() would only
# return FALSE and let the plot call fail later.
library(ggplot2)
# Scatter plot of Temp against Beats, coloured by gender (Sex_F).
# Written with ggplot() because qplot() is deprecated in current ggplot2.
ggplot(data = dta_3, aes(x = Temp, y = Beats, colour = Sex_F)) +
  geom_point() +
  labs(
    x = "Temp",
    y = "Beats",
    title = "Scatter Plot of Temp-Beats"
  )

# Jittered, half-transparent points with a linear fit; gender (Sex_F)
# drives both the colour and the point shape.
ggplot(dta_3, aes(Temp, Beats, colour = Sex_F, shape = Sex_F)) +
  geom_jitter(size = 3, alpha = 0.5) +
  geom_smooth(method = "lm")
## `geom_smooth()` using formula 'y ~ x'

ggplot 心得

1.從ggplot看到,Beats跟Temp有正相關

2.若以性別分組,Beats跟Temp在不同組別仍有正相關,但在Sex_F=1的組別,正相關較Sex_F=2的弱,兩組分別有沒有達顯著,還是要做檢定。

線性迴歸

library(MASS)
# Regress Beats on Temp and gender, then print the coefficient table.
fullmodel <- lm(formula = Beats ~ Temp + factor(Sex_F), data = dta_3)
summary(fullmodel)
## 
## Call:
## lm(formula = Beats ~ Temp + factor(Sex_F), data = dta_3)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -16.6824  -4.6553   0.3581   4.7838  15.8041 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)   
## (Intercept)    -165.24470   82.74978  -1.997  0.04797 * 
## Temp              2.43224    0.84344   2.884  0.00462 **
## factor(Sex_F)2    0.08114    1.23203   0.066  0.94760   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6.885 on 127 degrees of freedom
## Multiple R-squared:  0.06437,    Adjusted R-squared:  0.04964 
## F-statistic: 4.369 on 2 and 127 DF,  p-value: 0.01462

線性迴歸 心得

1.regression model可以看到 full model為顯著

2.Beats=-165.2447+2.43224x(Temp)+e,性別並無影響

3.體溫每上升一度,會增加2.43的心跳速率

4.這邊想要以Sex_F分組做linear regression,嘗試很久做不出來…

Inclass_4

A classmate of yours used data.entry() to change the first woman’s height to 50 in the women{datasets}. She then closed the editor and issued plot(women). To her surprise, she got this message: Error in xy.coords(x, y, xlabel, ylabel, log) : ‘x’ is a list, but does not have components ‘x’ and ‘y’

Explain what had happened. How would you plot the edited data file?

library(datasets)
# Copy the built-in `women` data set into dta_4.
dta_4 <- datasets::women

# Inspect the structure of the copy.
str(dta_4)
## 'data.frame':    15 obs. of  2 variables:
##  $ height: num  58 59 60 61 62 63 64 65 66 67 ...
##  $ weight: num  115 117 120 123 126 129 132 135 139 142 ...
class(dta_4)
## [1] "data.frame"
head(dta_4)
##   height weight
## 1     58    115
## 2     59    117
## 3     60    120
## 4     61    123
## 5     62    126
## 6     63    129
plot(dta_4)

# Open the spreadsheet-style editor on dta_4.  Per the str()/class() output
# recorded below, dta_4 is a plain list afterwards, not a data.frame.
data.entry(dta_4)
str(dta_4)
## List of 2
##  $ height: num [1:15] 58 59 60 61 62 63 64 65 66 67 ...
##  $ weight: num [1:15] 115 117 120 123 126 129 132 135 139 142 ...
class(dta_4)
## [1] "list"
#plot(dta_4) with error message
knitr::include_graphics("x is a list.png")

# data.entry() left dta_4 as a list (see class(dta_4) above), which plot()
# cannot draw directly.  Converting it back to a data.frame lets plot(dta_4)
# work again and labels the axes with the column names instead of the raw
# `dta_4$height` expression.
dta_4 <- as.data.frame(dta_4)
plot(dta_4)

Inclass_4 心得

1.使用data.entry()改資料,也會改變資料型態,從data.frame變成list

2.plot無法辨識list的資料,因此要告訴R,對應的資料是甚麼名字

Inclass_5

The Ministry of Interior of Taiwan provides many datasets on its website. Download the excel file of Table 8. Couples of Marriages, Divorces, Crude Marriage Rate and Crude Divorce Rate to examine the trend of the crude divorce rate over the years.

# Download the Table 8 workbook into a temporary .xls file.
fL <- "https://www.ris.gov.tw/documents/data/en/3/History-Table-8-2020.xls"
tf <- tempfile(fileext = ".xls")
GET(fL, write_disk(tf))
## Response [https://www.ris.gov.tw/documents/data/en/3/History-Table-8-2020.xls]
##   Date: 2021-10-31 15:21
##   Status: 200
##   Content-Type: application/vnd.ms-excel
##   Size: 34.3 kB
## <ON DISK>  C:\Users\User\AppData\Local\Temp\RtmpEvwXeK\file1e842ab07a71.xls
dta_5 <- read_excel(tf, sheet=1)
## New names:
## * `` -> ...2
## * `` -> ...3
## * `` -> ...4
## * `` -> ...5
dplyr::glimpse(dta_5) 
## Rows: 55
## Columns: 5
## $ `Table 8. Couples of Marriages, Divorces, Crude Marriage Rate and Crude Divorce Rate` <chr> ~
## $ ...2                                                                                  <chr> ~
## $ ...3                                                                                  <chr> ~
## $ ...4                                                                                  <chr> ~
## $ ...5                                                                                  <chr> ~
# Keep only the yearly data rows: drop the first three rows and everything
# from row 50 to the end of the sheet (these are rows, not columns).
# The old index set -c(1, 2, 3, 51:56) pointed past the 55 rows of dta_5 and
# left row 50 in place, which is the all-NA row seen in tail(dta_5b).
# NOTE(review): re-knit after this change - the outputs recorded below
# (47 rows, NA-coercion warning, trailing NA row) come from the old indices.
dta_5a <- dta_5[-c(1:3, 50:nrow(dta_5)), ]
str(dta_5a)
## tibble [47 x 5] (S3: tbl_df/tbl/data.frame)
##  $ Table 8. Couples of Marriages, Divorces, Crude Marriage Rate and Crude Divorce Rate: chr [1:47] "1975" "1976" "1977" "1978" ...
##  $ ...2                                                                               : chr [1:47] "149958" "152240" "156616" "164833" ...
##  $ ...3                                                                               : chr [1:47] "9.3300000000000001" "9.2799999999999994" "9.3599999999999994" "9.6699999999999999" ...
##  $ ...4                                                                               : chr [1:47] "7387" "8155" "9259" "10782" ...
##  $ ...5                                                                               : chr [1:47] "0.46000000000000002" "0.5" "0.56000000000000005" "0.64000000000000001" ...
dplyr::glimpse(dta_5a)
## Rows: 47
## Columns: 5
## $ `Table 8. Couples of Marriages, Divorces, Crude Marriage Rate and Crude Divorce Rate` <chr> ~
## $ ...2                                                                                  <chr> ~
## $ ...3                                                                                  <chr> ~
## $ ...4                                                                                  <chr> ~
## $ ...5                                                                                  <chr> ~
# New numeric column `year`, converted from the first (title-named) column.
dta_5a[["year"]] <- as.numeric(dta_5a[[1]])
## Warning: 強制變更過程中產生了 NA
# Numeric copies of the remaining four text columns, under readable names.
dta_5a[["marrage"]] <- as.numeric(dta_5a[["...2"]])
dta_5a[["marragerate"]] <- as.numeric(dta_5a[["...3"]])
dta_5a[["Divorce"]] <- as.numeric(dta_5a[["...4"]])
dta_5a[["Divorcerate"]] <- as.numeric(dta_5a[["...5"]])
# Drop the original columns 1-5 so only the new numeric columns remain.
dta_5b <- dta_5a[, -(1:5)]
str(dta_5b)
## tibble [47 x 5] (S3: tbl_df/tbl/data.frame)
##  $ year       : num [1:47] 1975 1976 1977 1978 1979 ...
##  $ marrage    : num [1:47] 149958 152240 156616 164833 155941 ...
##  $ marragerate: num [1:47] 9.33 9.28 9.36 9.67 8.98 9.67 9.55 8.74 8.59 8.08 ...
##  $ Divorce    : num [1:47] 7387 8155 9259 10782 12668 ...
##  $ Divorcerate: num [1:47] 0.46 0.5 0.56 0.64 0.73 0.77 0.83 0.93 0.95 1.01 ...
tail(dta_5b)
## # A tibble: 6 x 5
##    year marrage marragerate Divorce Divorcerate
##   <dbl>   <dbl>       <dbl>   <dbl>       <dbl>
## 1  2016  148349        6.31   53850        2.29
## 2  2017  137620        5.84   54439        2.31
## 3  2018  135403        5.74   54443        2.31
## 4  2019  133741        5.67   54346        2.3 
## 5  2020  120397        5.11   51238        2.19
## 6    NA      NA       NA         NA       NA

ggplot for year to divorce rate

library(GGally)
# Scatter plot of the crude divorce rate against year.
# Written with ggplot() because qplot() is deprecated in current ggplot2;
# this also removes the dangling trailing comma of the old call.
P1 <- ggplot(data = dta_5b, aes(x = year, y = Divorcerate)) +
  geom_point() +
  labs(
    x = "year",
    y = "Divorcerate",
    title = "ggplot of divorce rate by year"
  )

P1
## Warning: Removed 1 rows containing missing values (geom_point).

# Same plot with an x-axis tick every 10 years, starting at 1975.
P2 <- P1 + scale_x_continuous(breaks = seq(1975, 2020, by = 10))
P2
## Warning: Removed 1 rows containing missing values (geom_point).

ggplot 心得

  1. 定義year為as.numeric,才有辦法在X軸上做間隔的定義