1 In-class 1

AAUP Faculty Salary data

The AAUP2 data set is a comma-delimited fixed column format text file with ’*’ for missing value. Import the file into R and indicate missing values by ‘NA’. Hint: ?read.csv

1.1 Input data

# search the number of gap (blank)
dat1 <- readr::fwf_empty("aaup2.dat.txt")
dat1
## $begin
##  [1]  0  6 40 45 49 53 57 61 66 70 74 79 83 87 92 95
## 
## $end
##  [1]  5 39 43 48 52 56 60 65 69 73 78 82 86 90 94 NA
## 
## $col_names
##  [1] "X1"  "X2"  "X3"  "X4"  "X5"  "X6"  "X7"  "X8"  "X9"  "X10" "X11" "X12"
## [13] "X13" "X14" "X15" "X16"
# column base AAUP Faculty Salary data infomation
dat1 <- readr::read_fwf("aaup2.dat.txt", skip = 0, na = "*",
                        readr::fwf_cols(FICE=5,
                                        College=31,
                                        State=3,
                                        Type=4,
                                        AverageSalary_FullProfessors = 5,
                                        AverageSalary_AssociateProfessors = 4,
                                        AverageSalary_AssistantProfessors = 4,
                                        AverageSalary_AllRanks = 4,
                                        AverageCompensation_FullProfessors = 5,
                                        AverageCompensation_AssociateProfessors = 4,
                                        AverageCompensation_AssistantProfessors = 4,
                                        AverageCompensation_AllRanks = 5,
                                        Number_FullProfessors = 4,
                                        Number_AssociateProfessors=4,
                                        Number_AssistantProfessors=4,
                                        Number_instructors=4,
                                        Number_AllRanks=5
                                        ))
## Rows: 1161 Columns: 17
## -- Column specification --------------------------------------------------------
## 
## chr  (3): College, State, Type
## dbl (14): FICE, AverageSalary_FullProfessors, AverageSalary_AssociateProfess...
## 
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.

1.2 Table

# Table
DT::datatable(dat1, rownames=FALSE, options=list(pageLength=5))
# dat1 |> as.data.frame()|>head()

2 In-class 2

Here is a copy of the student roster in csv format from NCKU for a course I taught. Display the number of students from each major.

2.1 Input data

# [-1: remove 1st row, c(2,4,5): choose column 2, 4, 5 ]
dat2 <- read.csv("ncku_roster.csv", header = TRUE)[-1 , c(2,4,5)]

# Extract substrings in column 1 and keep first to third character vector.
dat2[,1] <- substr(dat2[,1],1,3)

# column rename 
colnames(dat2) <- c("Major","ID","Name")

2.2 Table

table(dat2[,1]) |> knitr::kable(col.names=c("Major","Number"))
Major Number
心理系 4
心理所 7
教育所 4

3 In-class 3

Data on body temperature, gender, and heart rate are taken from Mackowiak et al. (1992). “A Critical Appraisal of 98.6 Degrees F …,” in the Journal of the American Medical Association (268), 1578-80. Import the file. Find the correlation between body temperature and heart rate and investigate if there is a gender difference in mean temperature.

3.1 Input data

pacman::p_load(readxl, httr)
fL <- "https://mathcs.org/statistics/datasets/normtemp.xls"
GET(fL, write_disk(tf <- tempfile(fileext=".xls")))
## Response [https://mathcs.org/statistics/datasets/normtemp.xls]
##   Date: 2021-10-30 10:42
##   Status: 200
##   Content-Type: application/vnd.ms-excel
##   Size: 28.2 kB
## <ON DISK>  C:\Users\LIGHTM~1\AppData\Local\Temp\RtmpSSkyqc\file183c494b279f.xls
dat3 <- read_excel(tf, sheet=1)
#str(dat3)
dat3 |> as.data.frame()|> head()
##   Temp Sex Beats
## 1 96.3   1    70
## 2 96.7   1    71
## 3 96.9   1    74
## 4 97.0   1    80
## 5 97.1   1    73
## 6 97.1   1    75

3.2 Scatter plot with regression line

with(dat3, plot(Beats, Temp, xlab="Heart Rate", ylab="Temperature(°F)"))
grid()
abline(lm(Temp ~ Beats, data= dat3))

3.3 Boxplot

Temperature by gender

with(dat3, boxplot(Temp ~ as.factor(Sex), 
                   horizontal=F, 
                   frame=F,
                   col="aliceblue", 
                   varwidth=T,
                   xlab = "Gender",
                   ylab = "Temperature(°F)"))

3.4 Mean by Gender

Mean temperature and heart beats by gender
no difference (poor sense)??

show(mss <- aggregate(cbind(Temp, Beats) ~ Sex, data = dat3, FUN=mean))
##   Sex     Temp    Beats
## 1   1 98.10462 73.36923
## 2   2 98.39385 74.15385

3.6 Correlation

with(dat3, cor(Temp, Beats))
## [1] 0.2536564

3.7 T Test

t.test(Temp ~ as.factor(Sex), data=dat3)
## 
##  Welch Two Sample t-test
## 
## data:  Temp by as.factor(Sex)
## t = -2.2854, df = 127.51, p-value = 0.02394
## alternative hypothesis: true difference in means between group 1 and group 2 is not equal to 0
## 95 percent confidence interval:
##  -0.53964856 -0.03881298
## sample estimates:
## mean in group 1 mean in group 2 
##        98.10462        98.39385
# female's temperature higher than male
t.test(Beats ~ as.factor(Sex), data=dat3)
## 
##  Welch Two Sample t-test
## 
## data:  Beats by as.factor(Sex)
## t = -0.63191, df = 116.7, p-value = 0.5287
## alternative hypothesis: true difference in means between group 1 and group 2 is not equal to 0
## 95 percent confidence interval:
##  -3.243732  1.674501
## sample estimates:
## mean in group 1 mean in group 2 
##        73.36923        74.15385
# no difference in heart rate 

3.8 Regression coefficients

sapply(split(dat3, dat3$Sex), function(x) coef(lm(x$Temp ~ x$Beats))) |> knitr::kable()
1 2
(Intercept) 96.3978920 96.44212
x$Beats 0.0232621 0.02632

4 In-class 4

A classmate of yours used data.entry() to change the first woman’s height to 50 in the women{datasets}. She then closed the editor and issued plot(women). To her surprise, she got this message:
Error in xy.coords(x, y, xlabel, ylabel, log) : ‘x’ is a list, but does not have components ‘x’ and ‘y’

Explain what had happened. How would you plot the edited data file?

# load data 
data(women)

class(women)
## [1] "data.frame"
# head of data
women |> head()
##   height weight
## 1     58    115
## 2     59    117
## 3     60    120
## 4     61    123
## 5     62    126
## 6     63    129
# plot 
plot(women)

data.entry(women) 
# data.entry() A list of variables: currently these should be numeric or character vectors or list containing such vectors.
class(women)
## [1] "list"
plot(women[[1]], women[[2]])

5 In-class 5

The Ministry of Interior of Taiwan provides many datasets on its website. Download the excel file of Table 8. Couples of Marriages, Divorces, Crude Marriage Rate and Crude Divorce Rate to examine the trend of the crude divorce rate over the years.

5.1 Input data

fL <- "https://www.ris.gov.tw/documents/data/en/3/History-Table-8-2020.xls"
GET(fL, write_disk(tf <- tempfile(fileext=".xls")))
## Response [https://www.ris.gov.tw/documents/data/en/3/History-Table-8-2020.xls]
##   Date: 2021-10-30 10:42
##   Status: 200
##   Content-Type: application/vnd.ms-excel
##   Size: 34.3 kB
## <ON DISK>  C:\Users\LIGHTM~1\AppData\Local\Temp\RtmpSSkyqc\file183cf105b36.xls
# select column: "Year" and "crude divorce rate"
dat5 <- read_excel(tf,col_names = F)[-c(1:4,51:56),c(1,5)] 
## New names:
## * `` -> ...1
## * `` -> ...2
## * `` -> ...3
## * `` -> ...4
## * `` -> ...5
# column name
colnames(dat5) <- c("Year ","CrudeDivorceRate")

dat5 |> as.data.frame()|> head()
##   Year     CrudeDivorceRate
## 1  1975 0.46000000000000002
## 2  1976                 0.5
## 3  1977 0.56000000000000005
## 4  1978 0.64000000000000001
## 5  1979 0.72999999999999998
## 6  1980 0.77000000000000002

5.2 Plot

plot(dat5, xlab ="Year", ylab = "Crude Divorce Rate")

婚姻只是愛情的墳墓?