inclass1-files in fixed width format

The AAUP2 data set is a comma-delimited fixed column format text file with '*' for missing values. Import the file into R and indicate missing values by 'NA'. Hint: ?read.csv

# Naive attempt with read.csv(): this leaves all variables crammed into a
# single column, which is why the field layout is probed below.
dta <- read.csv("aaup2dat.txt", header = FALSE)
# Let readr guess the field boundaries from the blank columns in the file.
a <- readr::fwf_empty("aaup2dat.txt")
# Span of each field computed from the guessed begin/end positions
# (the last entry is NA because the final field has no guessed end).
i <- a[["end"]] - a[["begin"]] + 1
a

## $begin
##  [1]  0  6 40 45 49 53 57 61 66 70 74 79 83 87 92 95
## 
## $end
##  [1]  5 39 43 48 52 56 60 65 69 73 78 82 86 90 94 NA
## 
## $col_names
##  [1] "X1"  "X2"  "X3"  "X4"  "X5"  "X6"  "X7"  "X8"  "X9"  "X10" "X11" "X12"
## [13] "X13" "X14" "X15" "X16"

##  [1]  6 34  4  4  4  4  4  5  4  4  5  4  4  4  3 NA

讀進資料時發現所有變項都擠在同一個欄位，因此使用readr::fwf_empty，計算其每一欄位的空間隔

# The '*' missing-value marker is mapped to NA directly while reading.
# The sixteen contiguous field widths are named X1..X16.
dta1 <- readr::read_fwf(
  "aaup2dat.txt",
  readr::fwf_widths(
    c(6, 34, 4, 4, 4, 4, 4, 5, 4, 4, 5, 4, 4, 4, 4, 5),
    col_names = paste0("X", seq_len(16))
  ),
  na = '*'
)

# Inspect the column types and the first values of the imported data
dplyr::glimpse(dta1)

## Rows: 1,161
## Columns: 16
## $ X1  <dbl> 1061, 1063, 1065, 11462, 1002, 1004, 1008, 1009, 1012, 1016, 1019,~
## $ X2  <chr> "Alaska Pacific University      AK", "Univ.Alaska-Fairbanks       ~
## $ X3  <chr> "IIB", "I", "IIA", "IIA", "IIA", "IIA", "IIB", "I", "IIB", "IIB", ~
## $ X4  <dbl> 454, 686, 533, 612, 442, 441, 466, 580, 498, 506, 339, 461, 360, 3~
## $ X5  <dbl> 382, 560, 494, 507, 369, 385, 394, 437, 379, 412, 303, 389, 304, 3~
## $ X6  <dbl> 362, 432, 329, 414, 310, 310, 351, 374, 322, 359, 287, 338, 258, 2~
## $ X7  <dbl> 382, 508, 415, 498, 350, 388, 396, 455, 401, 411, 301, 386, 300, 2~
## $ X8  <dbl> 567, 914, 716, 825, 530, 542, 558, 692, 655, 607, 421, 585, 433, 4~
## $ X9  <dbl> 485, 753, 663, 681, 444, 473, 476, 527, 501, 508, 371, 496, 369, 4~
## $ X10 <dbl> 471, 572, 442, 557, 376, 383, 427, 451, 404, 445, 347, 436, 313, 3~
## $ X11 <dbl> 487, 677, 559, 670, 423, 477, 478, 546, 523, 503, 366, 493, 363, 3~
## $ X12 <dbl> 6, 74, 9, 115, 59, 57, 20, 366, 34, 67, 8, 106, 27, 17, 18, 83, 23~
## $ X13 <dbl> 11, 125, 26, 124, 77, 33, 18, 354, 25, 40, 15, 42, 25, 19, 28, 46,~
## $ X14 <dbl> 9, 118, 20, 101, 102, 35, 30, 301, 27, 66, 19, 66, 33, 31, 28, 77,~
## $ X15 <dbl> 4, 40, 9, 21, 24, 2, 0, 66, 3, 27, 2, 58, 4, 19, 3, 9, 1, 10, 19, ~
## $ X16 <dbl> 32, 404, 70, 392, 262, 127, 68, 1109, 89, 200, 44, 272, 89, 86, 77~

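這邊有點奇怪，X15算出來的間距是3，但實際上卻要切4才不會出現NA。原因是fwf_empty()回傳的begin是從0起算、end則不含該位置，所以end-begin+1只涵蓋欄位本身加上一個空白；而X14的end（90）與X15的begin（92）之間有兩個空白。前14欄的寬度累加到90，X15若只切3個字元只會涵蓋位置90–92，會把位於92–93的數值截斷，切4個字元才完整。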

inclass2- input csv file

Here is a copy of the student roster in csv format from NCKU for a course I taught. Display the number of students from each major.

# Read the roster and drop the first data row.
dta2 <- read.csv("ncku_roster.csv", header = TRUE)[-1, ]
# The major is the first three characters of the second column (系.年.班).
dta2$major <- substr(dta2[, 2], 1, 3)
str(dta2)

## 'data.frame':    15 obs. of  8 variables:
##  $ 座號      : chr  "1" "2" "3" "4" ...
##  $ 系.年.班  : chr  "心理系           3                               " "心理系           3                               " "心理系           4                               " "心理系           4                               " ...
##  $ 開課系序號: chr  "U7031" "U7031" "U7031" "U7031" ...
##  $ 學號      : chr  "D840239" "D840057" "D841311" "D840140" ...
##  $ 姓名      : chr  "蘇" "吳" "余" "王" ...
##  $ 成績      : logi  NA NA NA NA NA NA ...
##  $ 選課時間  : chr  "02/17/2016 09:17:40  " "02/17/2016 09:17:28  " "02/17/2016 09:09:10  " "02/17/2016 09:09:34  " ...
##  $ major     : chr  "心理系" "心理系" "心理系" "心理系" ...

# Show the student ID (column 4), name (column 5) and major (column 8)
knitr::kable(dta2[, c(4, 5, 8)])

	學號	姓名	major
2	D840239	蘇	心理系
3	D840057	吳	心理系
4	D841311	余	心理系
5	D840140	王	心理系
6	U360098	劉	教育所
7	U380416	陳	教育所
8	U360311	林	教育所
9	U380020	蔡	教育所
10	U760464	葉	心理所
11	U760480	王	心理所
12	U760420	陳	心理所
13	U760038	吳	心理所
14	U760446	林	心理所
15	U760019	胡	心理所
16	U760369	李	心理所

# Count the students in each major
knitr::kable(table(dta2$major), col.names = c('專業', '人數'))

專業	人數
心理系	4
心理所	7
教育所	4

inclass3- input excel file

Data on body temperature, gender, and heart rate are taken from Mackowiak et al. (1992). “A Critical Appraisal of 98.6 Degrees F …,” in the Journal of the American Medical Association (268), 1578-80. Import the file. Find the correlation between body temperature and heart rate and investigate if there is a gender difference in mean temperature.

pacman::p_load(readxl)
# Import the first sheet of the workbook and convert it to a plain data.frame
dta3 <- as.data.frame(
  read_excel("./normtemp.xls", sheet = 1, col_names = TRUE, na = "NA")
)
str(dta3)

## 'data.frame':    130 obs. of  3 variables:
##  $ Temp : num  96.3 96.7 96.9 97 97.1 97.1 97.1 97.2 97.3 97.4 ...
##  $ Sex  : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ Beats: num  70 71 74 80 73 75 82 64 69 70 ...

# Confirm that the imported object is a plain data.frame
class(dta3)

## [1] "data.frame"

為什麼讀excel檔需要給它path？即便我已設定好工作路徑，其他函數（如read.table）卻可以不用額外再寫path？（註：path只是read_excel()第一個引數的名稱，相對路徑同樣以工作目錄為基準，因此直接寫read_excel("normtemp.xls")也可以。）

# Quartiles, mean and range of every column
summary(dta3)

##       Temp             Sex          Beats      
##  Min.   : 96.30   Min.   :1.0   Min.   :57.00  
##  1st Qu.: 97.80   1st Qu.:1.0   1st Qu.:69.00  
##  Median : 98.30   Median :1.5   Median :74.00  
##  Mean   : 98.25   Mean   :1.5   Mean   :73.76  
##  3rd Qu.: 98.70   3rd Qu.:2.0   3rd Qu.:79.00  
##  Max.   :100.80   Max.   :2.0   Max.   :89.00

# Scatter plot of heart rate against temperature, with the fitted
# least-squares line of Beats on Temp drawn on top
plot(dta3$Temp, dta3$Beats, xlab = "temp", ylab = "beats")
abline(reg = lm(Beats ~ Temp, data = dta3))

# Correlation between body temperature and heart rate
cor(dta3$Temp, dta3$Beats)

## [1] 0.2536564

# Vertical box plots of heart rate (Beats), one box per sex
with(dta3, boxplot(Beats ~ as.factor(Sex), horizontal = FALSE, frame = FALSE,
                   col = "aliceblue", varwidth = TRUE))

boxplot看起來兩組心跳速率很接近

# Mean temperature and mean heart rate for each sex
mss <- aggregate(cbind(Temp, Beats) ~ Sex, data = dta3, FUN = mean)
show(mss)

##   Sex     Temp    Beats
## 1   1 98.10462 73.36923
## 2   2 98.39385 74.15385

不同性別在溫度與心跳速率平均值其實很接近

# Scatter plot grouped by sex: grid ("g"), points ("p") and a
# regression line per group ("r")
library(lattice)
xyplot(
  Beats ~ Temp,
  data = dta3,
  groups = Sex,
  type = c("g", "p", "r")
)

相對藍色點，紅色點其溫度越高心跳速率變化大。

# Fit Beats on Temp separately within each sex and tabulate the coefficients.
# The lambda argument stays named `x` because it shows up in the printed
# coefficient label ("x$Temp").
knitr::kable(sapply(split(dta3, dta3$Sex), function(x) {
  coef(lm(x$Beats ~ x$Temp))
}))

	1	2
(Intercept)	-87.966581	-233.623768
x$Temp	1.644528	3.128017

split將dta3依性別拆成兩個子資料集，並透過sapply分別對兩性進行心跳對溫度的迴歸分析。結果顯示：Sex=2時，溫度每增加1度心跳增加約3.13；Sex=1時，溫度每增加1度心跳增加約1.64。

結論：兩性在溫度對心跳的影響有所差異

inclass4-data.entry and structure of data

A classmate of yours used data.entry() to change the first woman’s height to 50 in the women{datasets}. She then closed the editor and issued plot(women). To her surprise, she got this message: Error in xy.coords(x, y, xlabel, ylabel, log) : ‘x’ is a list, but does not have components ‘x’ and ‘y’

Explain what had happened. How would you plot the edited data file?

# Load the women dataset and plot it before any editing
data(women)
plot(women)

# Inspect the column names and types
dplyr::glimpse(women)

## Rows: 15
## Columns: 2
## $ height <dbl> 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72
## $ weight <dbl> 115, 117, 120, 123, 126, 129, 132, 135, 139, 142, 146, 150, 154~

原本資料型態是data.frame, 類型是double

# Work on a copy of the dataset
dta4<-women
# Interactive editing step, left commented out in this script
#data.entry(dta4)
plot(dta4)

後來經過data.entry()修改第一位女性身高後，其錯誤訊息：

Error in xy.coords(x, y, xlabel, ylabel, log) : ‘x’ is a list, but does not have components ‘x’ and ‘y’

告知因資料型態為list，因此R抓不到變項

# Plot by passing the two columns explicitly as x and y
plot(dta4$height,dta4$weight)

可以透過$variable方式抓取變項

inclass5

The Ministry of Interior of Taiwan provides many datasets on its website. Download the excel file of Table 8. Couples of Marriages, Divorces, Crude Marriage Rate and Crude Divorce Rate to examine the trend of the crude divorce rate over the years.

# Read the sheet without headers, drop rows 1-4 and 51-56, and keep only
# column 1 (year) and column 5 (crude divorce rate).
dta5 <- read_excel(
  path = "./Couples of Marriages, Divorces, Crude Marriage Rate and Crude Divorce Rate.xls",
  sheet = 1, col_names = FALSE, col_types = NULL, na = ""
)[-c(1:4, 51:56), c(1, 5)] |>
  as.data.frame()
# Name the two columns
names(dta5) <- c("Year", "DivorceRate")
# Year is imported as text; convert it so that it sorts, summarises and
# plots as a number instead of as character strings.
dta5$Year <- as.numeric(dta5$Year)
# Keep the divorce rate to two decimal places
dta5$DivorceRate <- round(as.numeric(dta5$DivorceRate), 2)
# List the rows in year order to get a first idea of the x-axis range
dta5[order(dta5$Year), ]

##    Year DivorceRate
## 1  1975        0.46
## 2  1976        0.50
## 3  1977        0.56
## 4  1978        0.64
## 5  1979        0.73
## 6  1980        0.77
## 7  1981        0.83
## 8  1982        0.93
## 9  1983        0.95
## 10 1984        1.01
## 11 1985        1.08
## 12 1986        1.15
## 13 1987        1.17
## 14 1988        1.26
## 15 1989        1.26
## 16 1990        1.36
## 17 1991        1.38
## 18 1992        1.41
## 19 1993        1.45
## 20 1994        1.51
## 21 1995        1.56
## 22 1996        1.68
## 23 1997        1.80
## 24 1998        2.00
## 25 1999        2.23
## 26 2000        2.38
## 27 2001        2.53
## 28 2002        2.73
## 29 2003        2.88
## 30 2004        2.77
## 31 2005        2.76
## 32 2006        2.82
## 33 2007        2.55
## 34 2008        2.44
## 35 2009        2.48
## 36 2010        2.51
## 37 2011        2.46
## 38 2012        2.40
## 39 2013        2.30
## 40 2014        2.27
## 41 2015        2.28
## 42 2016        2.29
## 43 2017        2.31
## 44 2018        2.31
## 45 2019        2.30
## 46 2020        2.19

# Get a first idea of the minimum and maximum divorce rate
summary(dta5)

##      Year            DivorceRate   
##  Length:46          Min.   :0.460  
##  Class :character   1st Qu.:1.155  
##  Mode  :character   Median :1.900  
##                     Mean   :1.775  
##                     3rd Qu.:2.395  
##                     Max.   :2.880

# Crude divorce rate (column 2) plotted against year (column 1)
plot(dta5[[1]], dta5[[2]],
     xlab = "Year", ylab = "Divorce Rate",
     xlim = c(1975, 2020), ylim = c(0.4, 3))

1025HW

Jenny Hsu

2021/10/27

inclass1-files in fixed width format

inclass2- input csv file

inclass3- input excel file

inclass4-data.entry and structure of data

inclass5