In-class exercise

Q1.

The following student ID file is missing an ID U76067010 at the third to the last position. Find a way to fix it? Dowload and display the data contents in R.

dta <- read.table("student2017.txt",header = T)
# >fix(dta)
# added factor levels in 'ID'
write.table(dta, file="student2017_modified.txt", quote=F, col.names=T, row.names=F)
dta <- read.table("student2017_modified.txt",header = T)
dta

Q2.

A classmate of yours used data.entry() to change the first woman’s height to 50 in the women{datasets}. She then closed the editor and issued plot(women). To her surprise, she got this message:

Error in xy.coords(x, y, xlabel, ylabel, log) : ‘x’ is a list, but does not have components ‘x’ and ‘y’

Explain what had happened. How would you plot the edited data file?

> data.entry(women)
> women
$height
 [1] 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72

$weight
 [1] 115 117 120 123 126 129 132 135 139 142 146 150 154 159 164
> plot(women)
Error in xy.coords(x, y, xlabel, ylabel, log) :
  'x' is a list, but does not have components 'x' and 'y'
> plot(women$height,women$weight) # work!

需要確切給他x軸y軸的變項。

Q3.

Make a Google citations plot (for the last 5 years) of one of NCKU faculty members you know.

pacman::p_load(scholar)
dta <- get_citation_history('fCNw-4kAAAAJ') #Professor Chung-Ping Cheng
plot(dta, xlab="Year", ylab="Citations", type='h', lwd=2, xaxt="n", xlim = c(2014, 2019))
axis(side=1, at=seq(2014, 2019, by=1), cex.axis=0.7)
abline(h=seq(0, 400, by=200), lty=3, col="gray")

Q4.

Data on body temperature, gender, and heart rate. are taken from Mackowiak et al. (1992). “A Critical Appraisal of 98.6 Degrees F …,” in the Journal of the American Medical Association (268), 1578-80. Import the file. Find the correlation between body temperature and heart rate and investigate if there is a gender difference in mean temperature.

library("readxl")
dta <- read_excel("normtemp.xls")
dta$Sex <- factor(dta$Sex)
cor(dta$Temp,dta$Beats)
## [1] 0.2536564
# regress math2 by math1
dta.lm <- lm(Temp ~ Sex, data=dta)

# show results
summary(dta.lm)
## 
## Call:
## lm(formula = Temp ~ Sex, data = dta)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.99385 -0.47154  0.00615  0.40615  2.40615 
## 
## Coefficients:
##             Estimate Std. Error  t value Pr(>|t|)    
## (Intercept) 98.10462    0.08949 1096.298   <2e-16 ***
## Sex2         0.28923    0.12655    2.285   0.0239 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.7215 on 128 degrees of freedom
## Multiple R-squared:  0.03921,    Adjusted R-squared:  0.0317 
## F-statistic: 5.223 on 1 and 128 DF,  p-value: 0.02393
# show anova table
anova(dta.lm)

there is a gender difference in mean temperature.

Q5.

The AAUP2 data set is a comma-delimited fixed column format text file with ’*’ for missing value. Import the file into R and indicate missing values by ‘NA’. Hint: ?read.csv

readr::fwf_empty("aaup2.txt")[1:2]
## $begin
##  [1]  0  6 40 45 49 53 57 61 66 70 74 79 83 87 92 95
## 
## $end
##  [1]  5 39 43 48 52 56 60 65 69 73 78 82 86 90 94 NA
dta <- readr::read_fwf("aaup2.txt", skip=1, na = c("*"),
              readr::fwf_cols(x1=6,x2=34,x3=5,x4=4,x5=4,x6=4,x7=4,x8=5,x9=4,x10=4,x11=5,x12=4,x13=4,x14=5,x15=3, Rape=15))
## Warning: 1160 parsing failures.
## row  col expected actual        file
##   1 Rape 15 chars      4 'aaup2.txt'
##   2 Rape 15 chars      4 'aaup2.txt'
##   3 Rape 15 chars      4 'aaup2.txt'
##   4 Rape 15 chars      4 'aaup2.txt'
##   5 Rape 15 chars      4 'aaup2.txt'
## ... .... ........ ...... ...........
## See problems(...) for more details.
dta

Exercises

Q1.

Here is a copy of the student roster in csv format from NCKU for a course I taught. Dispaly the number of students from each major.

dta <- read.csv("ncku_roster.csv",header = T,fileEncoding='big5')
dta <- dta[-1,]
dta
dta$major <- factor(substr(dta$系.年.班,1,2))
aggregate(系.年.班 ~ major, data=dta, FUN=length)

Q2.

Chatterjee and Hadi (Regression by Examples, 2006) provided a link to the right to work data set on their web page. Display the relationship between Income and Taxes.

readr::fwf_empty("P005.txt")[1:2]
## $begin
## [1] 0
## 
## $end
## [1] NA
dta <- read.table("P005.txt",header=T, sep="\t", fill=TRUE)
# fill :  If TRUE then in case the rows have unequal length, blank fields are implicitly added.
cor(dta$Income, dta$Taxes)
## [1] 0.0560718
plot(dta$Income,dta$Taxes, xlab = 'Income',ylab='Taxes', pch=20)

Q3.

Download the data file in junior school project and read it into your currect R session. Assign the data set to a data frame object called jsp. 1. Re-name the variable ‘sex’ as ‘Gender’. 2. Re-label the values of the social class variable using the (long character strings) descriptive terms to produce the following plot.

jsp <- read.table("juniorSchools.txt", header = T)
names(jsp)[3] <- "Gender"
jsp$soc <- factor(jsp$soc, labels = c('I','II','II_0man','III_man','IV','V','VI_Unemp_L','VII_emp_NC','VII_Miss_Dad'))
plot(jsp$soc, jsp$math, xlab='SOC', ylab='math')

3. Save the edited jsp data object out as a comma-separated-value file and in a R data format to a data folder and read them back into your R session, separately.

write.table(jsp, file="jsp.txt", sep=",", quote=F, col.names=T, row.names=F)
jsp_tmp <- read.table("jsp.txt", header = T, sep = ',')
jsp_tmp

Q4.

The following zip file contains one subject’s laser-event potentials (LEP) data for 4 separate conditions (different level of stimulus intensity), each in a plain text file (1w.dat, 2w.dat, 3w.dat and 4w.dat). The rows are time points from -100 to 800 ms sampled at 2 ms per record. The columns are channel IDs. Input all the files into R for graphical exploration.

unzip('Subject1.zip', exdir="tmp_data")
library(data.table)
w1 <- data.table::fread("tmp_data/Subject1/1w.dat")
w2 <- data.table::fread("tmp_data/Subject1/2w.dat")
w3 <- data.table::fread("tmp_data/Subject1/3w.dat")
w4 <- data.table::fread("tmp_data/Subject1/4w.dat")
timePoints <- seq(-100,800,2)
library(tidyr)
F7 <- data.frame(cbind(timePoints,w1[,1],w2[,1],w3[,1],w4[,1]))
names(F7) <- c("time","w1", "w2", "w3", "w4")
F7_long <- gather(F7, condition, LEP, w1:w4, factor_key=TRUE)
library(ggplot2)
ggplot(data = F7_long, mapping = aes(x = time, y = LEP, color = condition)) +
  geom_line()

w1_long <- gather(data.frame(cbind(timePoints,w1[,-31])),condition, LEP, 2:31, factor_key=TRUE)
ggplot(data = w1_long, mapping = aes(x = timePoints, y = LEP, color = condition)) +
  geom_line()

Q5.

The ASCII (plain text) file schiz.asc contains response times (in milliseconds) for 11 non-schizophrenics and 6 schizophrenics (30 measurements for each person). Summarize and compare descriptive statistics of the measurements from the two groups.

dta <- read.table('schiz.asc.txt', skip=5, header = F) 
# skip : the number of lines of the data file to skip before beginning to read data.
# dta <- scan('schiz.asc.txt',skip=5) #use scan
dta$class <- factor(c(rep("non-schizo", 11),rep("schizo", 6)))
dta
aggregate(. ~ class, mean, data=dta)
aggregate(. ~ class, sd, data=dta)
# library(reshape2)
tmp_long <- gather(dta, condition, ms, V1:V30, factor_key=TRUE)
# tmp_wide <- dcast(tmp_long, condition ~ class, value.var="mean")
ggplot(data = tmp_long, mapping = aes(x = condition, y = ms, color = class)) +
  geom_point() +
  stat_summary(aes(y = ms ,group = class), fun.y=mean, geom="line")
## Warning: `fun.y` is deprecated. Use `fun` instead.

t.test(tmp_long$ms~ tmp_long$class)
## 
##  Welch Two Sample t-test
## 
## data:  tmp_long$ms by tmp_long$class
## t = -9.8771, df = 190.98, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -235.9773 -157.4166
## sample estimates:
## mean in group non-schizo     mean in group schizo 
##                 310.1697                 506.8667