Exercises:

1.The number of complaints received for each of 44 doctors working in the emergency room of a large hospital in a year was recorded. Does the distribution of number of complaints follow the Poisson distribution with mean 3? Use this R script to answer the question.

# upper bound of the count value
n <- 13;
# mean parameter of a Poisson distribution
lambda <- 3;
# upper bound of y-axis for plotting
ylim <- 0.25
# set up a plotting frame
plot(c(0, n), c(0, ylim), type ="n", xlab ="Count", ylab="Probability")
# do vertical lines
segments(0:n, rep(0, n), 0:n, dpois(0:n, lambda))
# plot title 
title(paste("Poisson distribution, ", "mean = ", lambda))

# predicted complaints
df <- data.frame(count=0:7, predicted=dpois(0:7, lambda)*41)
#df$prb <- df$predicted/sum(df$predicted)

Source: Le, C.T. (2010). Applied categorical data analysis and translational research. P. 203.
Column 1: Number of complaints made
*Column 2: Number of doctors**

library(lattice)
dta <- read.csv(file="http://titan.ccunix.ccu.edu.tw/~psycfs/dataM/Data/ERComplaints.txt",h=T,sep="")
with(dta, qqmath(numberOfDr,
                 prepanel = prepanel.qqmathline,
                 panel = function(x, ...) {
                         panel.qqmathline(x, y=x,
                                          distribution= function(p) qpois(p, 3), col="blue")
                         panel.qqmath(x, ...) }))

# a little bit strange

solution 2

library(fitdistrplus)
## Loading required package: MASS
#hist(dta$numberOfDr)
fit <- fitdist(dta$numberOfDr, "pois")
plot(fit)

fit$aic;fit$bic;fit$estimate
## [1] 54.29873
## [1] 54.37818
## lambda 
##    5.5

2. A group of students have taken exams in Math, Science, and English. These scores are to be combined into a single grade for each student. An ‘A’ is assigned to the top 20 percent of students, a ‘B’ to the next 20 percent, and so on. The final results are sorted alphabetically by the last names of the students. Use the following R script to complete the task described above.

pupil <- c("John Denver", "Amy Tan", "Bill Clinton", "Dave Kuhn", 
"Jane Mayer", "Kevin Costner", "Ruben Yin", "Greg Knox", "Joel English",
"Meg Ryan")
Math <- c(502, 600, 412, 358, 495, 512, 410, 625, 573, 522)
Science <- c(95, 99, 80, 82, 75, 85, 80, 95, 89, 86)
English <- c(25, 22, 18, 15, 20, 28, 15, 30, 27, 18)
roster <- data.frame(pupil, Math, Science, English, stringsAsFactors=FALSE)
fullname <- strsplit((roster$pupil), " ")
#str(fullname)
firstname <- sapply(fullname, "[", 1)
lastname <- sapply(fullname, "[", 2)
roster <- data.frame(firstname, lastname, roster[ , -1])
roster1 <- roster
roster1[,3:5] <- scale(roster1[,3:5])
roster1$grade <- roster1$Math+roster1$Science+roster1$English
roster1$rank <- cut(roster1$grade,5,labels = toupper(letters[5:1]))
roster$rank <- roster1$rank
roster[order(roster$lastname),]
##    firstname lastname Math Science English rank
## 3       Bill  Clinton  412      80      18    E
## 6      Kevin  Costner  512      85      28    B
## 1       John   Denver  502      95      25    B
## 9       Joel  English  573      89      27    B
## 8       Greg     Knox  625      95      30    A
## 4       Dave     Kuhn  358      82      15    E
## 5       Jane    Mayer  495      75      20    D
## 10       Meg     Ryan  522      86      18    D
## 2        Amy      Tan  600      99      22    A
## 7      Ruben      Yin  410      80      15    E

3.School administrators collected attendance data on 316 high school juniors from two urban high schools. The variables of interest are the number of days absent from school, gender of the student, and a standardized test score in mathematics.

dta <- read.csv(file="http://titan.ccunix.ccu.edu.tw/~psycfs/dataM/Data/attendance.txt",h=T,sep="")
#head(dta);str(dta)

Recode the school IDs to ‘S01’ and ‘S02’.

library(plyr);library(dplyr);library(ggplot2)
## 
## Attaching package: 'dplyr'
## 
## The following objects are masked from 'package:plyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize
## 
## The following object is masked from 'package:MASS':
## 
##     select
## 
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## 
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
dta$school <- revalue(dta$school <- as.factor(dta$school), c("1"="S01", "2"="S02"))

Recode the gender ID to ‘Female’ and ‘Male’.

dta$male <- revalue(dta$male <- as.factor(dta$male), c("1" ="Male", "0" ="Female"))
dta <- rename(dta,gender=male)

Create index for students within each school.

# I could not catch the meaning of the question.

Find the mean math score of each school.

aggregate(dta$math, by=list(dta$school), FUN=mean, na.rm=TRUE)
##   Group.1        x
## 1     S01 42.20373
## 2     S02 55.38172

Find the mean math score of each school by gender.

aggregate(dta$math, by=list(dta$school,dta$gender), FUN=mean, na.rm=TRUE)
##   Group.1 Group.2        x
## 1     S01  Female 42.01706
## 2     S02  Female 55.97780
## 3     S01    Male 42.36218
## 4     S02    Male 54.60155

Find the mean number of days of absence of each school.

mean(dta$daysabs)
## [1] 5.810127

Find the mean number of days of absence of each school by gender.

aggregate(dta$daysabs, by=list(dta$gender), FUN=mean, na.rm=TRUE)
##   Group.1        x
## 1  Female 6.697531
## 2    Male 4.876623

Is the number of days of absence related to either gender or math score or both? Does the same relationship hold for both schools?

ggplot(dta,aes(y=daysabs,x=math,col=gender))+
        geom_point()+
        geom_smooth(method = "lm")+
        facet_grid(.~school)

summary(lm(daysabs~math+gender+school+gender*school,data=dta))
## 
## Call:
## lm(formula = daysabs ~ math + gender + school + gender * school, 
##     data = dta)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -10.624  -3.568  -1.911   1.997  37.180 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          11.35579    1.28053   8.868  < 2e-16 ***
## math                 -0.02216    0.02355  -0.941 0.347477    
## genderMale           -4.23096    1.10505  -3.829 0.000156 ***
## schoolS02            -6.47483    1.14468  -5.656  3.5e-08 ***
## genderMale:schoolS02  3.78060    1.57273   2.404 0.016808 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6.944 on 311 degrees of freedom
## Multiple R-squared:  0.1421, Adjusted R-squared:  0.1311 
## F-statistic: 12.88 on 4 and 311 DF,  p-value: 1.025e-09
summary(lm(daysabs~math+gender,data=subset(dta,dta$school=="S01")))
## 
## Call:
## lm(formula = daysabs ~ math + gender, data = subset(dta, dta$school == 
##     "S01"))
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -10.759  -5.602  -2.375   2.629  36.695 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 11.98752    1.79502   6.678 4.02e-10 ***
## math        -0.03720    0.03594  -1.035  0.30234    
## genderMale  -4.22577    1.31925  -3.203  0.00165 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 8.289 on 156 degrees of freedom
## Multiple R-squared:  0.06807,    Adjusted R-squared:  0.05612 
## F-statistic: 5.697 on 2 and 156 DF,  p-value: 0.004092
summary(lm(daysabs~math+gender,data=subset(dta,dta$school=="S02")))
## 
## Call:
## lm(formula = daysabs ~ math + gender, data = subset(dta, dta$school == 
##     "S02"))
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -3.707 -3.206 -1.657  0.794 37.372 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)  
## (Intercept)  3.553860   1.694079   2.098   0.0376 *
## math         0.001547   0.028584   0.054   0.9569  
## genderMale  -0.417732   0.846452  -0.494   0.6224  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.25 on 154 degrees of freedom
## Multiple R-squared:  0.001618,   Adjusted R-squared:  -0.01135 
## F-statistic: 0.1248 on 2 and 154 DF,  p-value: 0.8828

4.The following R script converts Taiwan dollars into US dollars. Modify it so that it converts US dollars to Euros.

US2EU <- function(data_US, exrate)
  return(paste("$", format(round(data_US*exrate, 2)),sep=""))
exrate <- 0.889209
data_US <- round(rnorm(6, mean=3000, sd=300))
data.frame(USD=data_US, Euros=US2EU(data_US, exrate))
##    USD    Euros
## 1 2780 $2472.00
## 2 3564 $3169.14
## 3 3000 $2667.63
## 4 2416 $2148.33
## 5 2730 $2427.54
## 6 2731 $2428.43

8.Use the data in the high schools example and write your own functions to solve the following problems:

(a) test if any pairs of the five variables: read , write , math science , and socst , are different in means.

dta <- read.table("data/hs0.txt",h=T)
head(dta)
##    id female  race    ses schtyp     prog read write math science socst
## 1  70   male white    low public  general   57    52   41      47    57
## 2 121 female white middle public vocation   68    59   53      63    61
## 3  86   male white   high public  general   44    33   54      58    31
## 4 141   male white   high public vocation   63    44   47      53    56
## 5 172   male white middle public academic   47    52   57      53    61
## 6 113   male white middle public academic   44    52   51      63    61
pairs_meandiff <- function(var1,var2){
        ans <- t.test(var1,var2)$p.value
        print(ans)  # for proving
        return(ifelse(ans>0.05,"Same","Different"))
}
pairs_meandiff(dta$read,dta$read)
## [1] 1
## [1] "Same"

(b) test if the 4 different ethnic groups have the same mean scores for each of the 5 variables (individually): read , write , math science , and socst .

factor_meandiff <- function(var1,var2){
        x <- summary(lm(var2~var1))
        ans <- pf(x$fstatistic[1], x$fstatistic[2], x$fstatistic[3],
     lower.tail = FALSE)
        results <- ifelse(as.numeric(ans)>0.05,"There are all same mean scores","There are some different scores")
        #print(x)
        #print(ans) # for checking
        return(results)
}

factor_meandiff(dta$race,dta$read)
## [1] "There are some different scores"
factor_meandiff(dta$race,dta$write)
## [1] "There are some different scores"
factor_meandiff(dta$race,dta$math)
## [1] "There are some different scores"
factor_meandiff(dta$race,dta$science)
## [1] "There are some different scores"
factor_meandiff(dta$race,dta$socst)
## [1] "There are some different scores"