Q1

The AAUP2 data set is a comma-delimited fixed column format text file with ‘*’ marking missing values. Import the file into R and indicate missing values by ‘NA’. Hint: ?read.csv

# First look at the raw file with read.csv().
# read.csv() finds no comma-separated fields here, so every line ends up in a
# single character column, and because the file has no header line the first
# record is consumed as the column name. This call is only a peek at the raw
# lines; it is not the final import.
dta <- read.csv(
  file = "C:/Users/Ching-Fang Wu/Documents/dataM/aaup2.dat.txt",
  header = TRUE
)
head(dta)
##   X1061.Alaska.Pacific.University......AK.IIB..454.382.362.382..567.485.471..487...6..11...9...4...32
## 1  1063 Univ.Alaska-Fairbanks          AK I    686 560 432 508  914 753 572  677  74 125 118  40  404
## 2  1065 Univ.Alaska-Southeast          AK IIA  533 494 329 415  716 663 442  559   9  26  20   9   70
## 3 11462 Univ.Alaska-Anchorage          AK IIA  612 507 414 498  825 681 557  670 115 124 101  21  392
## 4  1002 Alabama Agri.&Mech. Univ.      AL IIA  442 369 310 350  530 444 376  423  59  77 102  24  262
## 5  1004 University of Montevallo       AL IIA  441 385 310 388  542 473 383  477  57  33  35   2  127
## 6  1008 Athens State College           AL IIB  466 394 351 396  558 476 427  478  20  18  30   0   68
# Check the structure of the read.csv() result: one character column only.
tibble::glimpse(dta)
## Rows: 1,162
## Columns: 1
## $ X1061.Alaska.Pacific.University......AK.IIB..454.382.362.382..567.485.471..487...6..11...9...4...32 <chr> ~
# Let readr guess the field boundaries from the blank columns of the file;
# the begin/end positions it reports guide the column widths used for import.
readr::fwf_empty("C:/Users/Ching-Fang Wu/Documents/dataM/aaup2.dat.txt")
## $begin
##  [1]  0  6 40 45 49 53 57 61 66 70 74 79 83 87 92 95
## 
## $end
##  [1]  5 39 43 48 52 56 60 65 69 73 78 82 86 90 94 NA
## 
## $skip
## [1] 0
## 
## $col_names
##  [1] "X1"  "X2"  "X3"  "X4"  "X5"  "X6"  "X7"  "X8"  "X9"  "X10" "X11" "X12"
## [13] "X13" "X14" "X15" "X16"
# Import the fixed-width file; "*" marks a missing value and is read as NA.
# The file has no header line, so no rows are skipped (skipping one would
# silently drop the first institution in the file).
# The widths add up to the full 99-character line:
#   X1  = institution id
#   X2  = institution name (32 wide so long names are not cut short)
#   X3  = state and institution-type codes together, e.g. "AK IIB"
#   X4-X15 = numeric fields
#   X16 = last numeric field; it is 5 wide, a width of 4 would cut off the
#         final digit of every line
dta <- readr::read_fwf(
  "C:/Users/Ching-Fang Wu/Documents/dataM/aaup2.dat.txt",
  col_positions = readr::fwf_cols(
    X1 = 5, X2 = 32, X3 = 6, X4 = 5, X5 = 4, X6 = 4, X7 = 4, X8 = 5,
    X9 = 4, X10 = 4, X11 = 5, X12 = 4, X13 = 4, X14 = 4, X15 = 4, X16 = 5
  ),
  na = "*"
)
## 
## -- Column specification --------------------------------------------------------
## cols(
##   X1 = col_double(),
##   X2 = col_character(),
##   X3 = col_character(),
##   X4 = col_double(),
##   X5 = col_double(),
##   X6 = col_double(),
##   X7 = col_double(),
##   X8 = col_double(),
##   X9 = col_double(),
##   X10 = col_double(),
##   X11 = col_double(),
##   X12 = col_double(),
##   X13 = col_double(),
##   X14 = col_double(),
##   X15 = col_double(),
##   X16 = col_double()
## )

Q2

Here is a copy of the student roster in csv format from NCKU for a course I taught.

Display the number of students from each major.

# Read the NCKU class roster (CSV, first line used as column names) and
# preview the first rows.
q2dta <- read.csv(
  file = "C:/Users/Ching-Fang Wu/Documents/dataM/ncku_roster.csv",
  header = TRUE
)
head(q2dta)
##                     座號                                          系.年.班
## 1 教師:U3023  許清芳                                                      
## 2                      1 心理系           3                               
## 3                      2 心理系           3                               
## 4                      3 心理系           4                               
## 5                      4 心理系           4                               
## 6                      5 教育所           1 碩                            
##   開課系序號                                  學號
## 1            上課時間: 一[6-8];開課號:U3006  U7031
## 2      U7031                               D840239
## 3      U7031                               D840057
## 4      U7031                               D841311
## 5      U7031                               D840140
## 6      U3006                               U360098
##                                            姓名 成績              選課時間
## 1 科目:資料管理                                   NA                      
## 2                                            蘇   NA 02/17/2016 09:17:40  
## 3                                            吳   NA 02/17/2016 09:17:28  
## 4                                            余   NA 02/17/2016 09:09:10  
## 5                                            王   NA 02/17/2016 09:09:34  
## 6                                            劉   NA 01/18/2016 14:56:35
# Drop the first row: it carries course information (instructor, subject,
# schedule), not a student record.
q2dta <- q2dta[-1, ]
# Keep only the first three characters of column 2, i.e. the department name.
q2dta[[2]] <- substr(q2dta[[2]], 1, 3)
# Count the students in each department.
table(q2dta[[2]])
## 
## 心理系 心理所 教育所 
##      4      7      4

心理系 4 位;心理所 7 位;教育所 4 位

Q3

Data on body temperature, gender, and heart rate are taken from Mackowiak et al. (1992). “A Critical Appraisal of 98.6 Degrees F …,” in the Journal of the American Medical Association (268), 1578-80. Import the file.

Find the correlation between body temperature and heart rate and investigate if there is a gender difference in mean temperature.

# install tools
#install.packages("pacman")
library(pacman)
pacman::p_load(readxl, httr)
# Import the body-temperature data from the Excel workbook and print it.
q3dta <- readxl::read_excel(
  path = "C:/Users/Ching-Fang Wu/Documents/dataM/normtemp.xls"
)
print(q3dta)
## # A tibble: 130 x 3
##     Temp   Sex Beats
##    <dbl> <dbl> <dbl>
##  1  96.3     1    70
##  2  96.7     1    71
##  3  96.9     1    74
##  4  97       1    80
##  5  97.1     1    73
##  6  97.1     1    75
##  7  97.1     1    82
##  8  97.2     1    64
##  9  97.3     1    69
## 10  97.4     1    70
## # ... with 120 more rows
# Pairwise correlations of all three columns; the Temp-Beats entry is the
# correlation between body temperature and heart rate.
cor(q3dta) 
##            Temp        Sex      Beats
## Temp  1.0000000 0.19800622 0.25365640
## Sex   0.1980062 1.00000000 0.05576622
## Beats 0.2536564 0.05576622 1.00000000
# Overall summary statistics for every column
summary(q3dta)
##       Temp             Sex          Beats      
##  Min.   : 96.30   Min.   :1.0   Min.   :57.00  
##  1st Qu.: 97.80   1st Qu.:1.0   1st Qu.:69.00  
##  Median : 98.30   Median :1.5   Median :74.00  
##  Mean   : 98.25   Mean   :1.5   Mean   :73.76  
##  3rd Qu.: 98.70   3rd Qu.:2.0   3rd Qu.:79.00  
##  Max.   :100.80   Max.   :2.0   Max.   :89.00
# Investigate a gender difference in mean temperature.
# Side-by-side box plots of Temp for the two Sex groups; varwidth makes the
# box widths reflect the group sizes.
boxplot(
  Temp ~ Sex,
  data = q3dta,
  horizontal = FALSE,
  frame = FALSE,
  col = "aliceblue",
  varwidth = TRUE
)

# Independent two-sample t-test of mean temperature by Sex
t.test(Temp ~ Sex, data = q3dta)
## 
##  Welch Two Sample t-test
## 
## data:  Temp by Sex
## t = -2.2854, df = 127.51, p-value = 0.02394
## alternative hypothesis: true difference in means between group 1 and group 2 is not equal to 0
## 95 percent confidence interval:
##  -0.53964856 -0.03881298
## sample estimates:
## mean in group 1 mean in group 2 
##        98.10462        98.39385

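p-value = 0.02394 < 0.05, so we reject the null hypothesis (the true difference in means between group 1 and group 2 is equal to 0) in favour of the alternative hypothesis: the true difference in means between group 1 and group 2 is not equal to 0. Mean body temperature differs between the two groups (98.10 in group 1 vs. 98.39 in group 2).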

Q4

A classmate of yours used data.entry() to change the first woman’s height to 50 in the women{datasets}. She then closed the editor and issued plot(women). To her surprise, she got this message: Error in xy.coords(x, y, xlabel, ylabel, log) : ‘x’ is a list, but does not have components ‘x’ and ‘y’

Explain what had happened. How would you plot the edited data file?

# Load the built-in women data set and draw its default scatter plot.
data(women)
plot(women)

# Work on a copy so the original data set is left unchanged
lady<-women
# Open the spreadsheet-like editor (interactive); the first height is
# changed to 50 by hand there
data.entry(data=lady) 
# Calling plot(lady) after closing the editor fails with:
#   Error in xy.coords(x, y, xlabel, ylabel, log) :
#   'x' is a list, but does not have components 'x' and 'y'

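將第一位女性的身高改為50並關掉視窗後,data.entry() 會把編輯結果以一般的 list(而非 data.frame)寫回,因此 plot() 不再使用 data.frame 的繪圖方法,而是把它當成座標 list 交給 xy.coords();這個 list 只有 height 與 weight 兩個元素,沒有 x 與 y,所以出現上述錯誤訊息。解決方法是直接指定兩個向量作圖(如下),或先用 as.data.frame() 轉回資料框再作圖。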

# Plot the edited values by passing the two components explicitly as x and y
plot(lady$height,lady$weight)

Q5

The Ministry of Interior of Taiwan provides many datasets on its website. Download the excel file of Table 8. Couples of Marriages, Divorces, Crude Marriage Rate and Crude Divorce Rate to examine the trend of the crude divorce rate over the years.

# Import the downloaded Ministry of Interior workbook (Table 8) as is.
q5dta <- readxl::read_excel(
  path = "C:/Users/Ching-Fang Wu/Documents/dataM/Couples of Marriages, Divorces, Crude Marriage Rate and Crude Divorce Rate.xls"
)
## New names:
## * `` -> ...2
## * `` -> ...3
## * `` -> ...4
## * `` -> ...5
# Preview the first rows; the sheet's title, unit and header rows were read
# in as ordinary data rows.
head(q5dta)
## # A tibble: 6 x 5
##   `Table 8. Couples of Marriag~ ...2        ...3        ...4          ...5      
##   <chr>                         <chr>       <chr>       <chr>         <chr>     
## 1  <NA>                          <NA>        <NA>        <NA>         "Unit : C~
## 2 "年別\nYear"                  "結婚對數\~ "粗結婚率\~ "離婚/終止結~ "粗離婚率~
## 3  <NA>                          <NA>        <NA>        <NA>          <NA>     
## 4 "1975"                        "149958"    "9.3300000~ "7387"        "0.460000~
## 5 "1976"                        "152240"    "9.2799999~ "8155"        "0.5"     
## 6 "1977"                        "156616"    "9.3599999~ "9259"        "0.560000~
# Examine the data structure: all five columns were read as character
str(q5dta)
## tibble [55 x 5] (S3: tbl_df/tbl/data.frame)
##  $ Table 8. Couples of Marriages, Divorces, Crude Marriage Rate and Crude Divorce Rate: chr [1:55] NA "年別\nYear" NA "1975" ...
##  $ ...2                                                                               : chr [1:55] NA "結婚對數\nCouples of\nMarriages" NA "149958" ...
##  $ ...3                                                                               : chr [1:55] NA "粗結婚率\nCrude\nMarriage  Rate" NA "9.3300000000000001" ...
##  $ ...4                                                                               : chr [1:55] NA "離婚/終止結婚\nDivorce/\nTerminated Marriage" NA "7387" ...
##  $ ...5                                                                               : chr [1:55] "Unit : Couple ; <U+2030>" "粗離婚率\nCrude\nDivorce Rate" NA "0.46000000000000002" ...
# Keep only the variables needed: the year (column 1) and the crude divorce
# rate (column 5).
new_q5dta <- q5dta[, c(1, 5)]
# Give the two retained columns readable names, then print the result.
names(new_q5dta) <- c("Year", "Crude Divorce Rate")
new_q5dta
## # A tibble: 55 x 2
##    Year         `Crude Divorce Rate`           
##    <chr>        <chr>                          
##  1  <NA>        "Unit : Couple ; <U+2030>"            
##  2 "年別\nYear" "粗離婚率\nCrude\nDivorce Rate"
##  3  <NA>         <NA>                          
##  4 "1975"       "0.46000000000000002"          
##  5 "1976"       "0.5"                          
##  6 "1977"       "0.56000000000000005"          
##  7 "1978"       "0.64000000000000001"          
##  8 "1979"       "0.72999999999999998"          
##  9 "1980"       "0.77000000000000002"          
## 10 "1981"       "0.82999999999999996"          
## # ... with 45 more rows
# Plot the crude divorce rate against the year to show its trend.
# Both imported columns are character and still contain the sheet's title,
# header and blank rows. Keep only the rows whose first column is a
# four-digit year and convert both columns to numbers before plotting;
# handing the raw text to plot() forces a coercion that produces NAs and
# warnings.
year_chr <- trimws(new_q5dta[[1]])
is_data_row <- grepl("^[0-9]{4}$", year_chr)  # NA entries give FALSE
divorce_year <- as.numeric(year_chr[is_data_row])
divorce_rate <- as.numeric(new_q5dta[[2]][is_data_row])
# type = "b" joins the yearly points so the trend is visible and lets the
# lty/lwd settings take effect (they are ignored for a points-only plot).
plot(divorce_year, divorce_rate,
     type = "b",
     xlab = "Year",
     ylab = "Crude Divorce Rate",
     main = "Crude Divorce Rate Yearly",
     xlim = c(1975, 2021),
     ylim = c(0.1, 4),
     lty = 2, lwd = 2, pch = "*", col = 4)

##