Exercises:
1.The number of complaints received for each of 44 doctors working in the emergency room of a large hospital in a year was recorded. Does the distribution of number of complaints follow the Poisson distribution with mean 3? Use this R script to answer the question.
# upper bound of the count value
n <- 13;
# mean parameter of a Poisson distribution
lambda <- 3;
# upper bound of y-axis for plotting
ylim <- 0.25
# set up a plotting frame
plot(c(0, n), c(0, ylim), type ="n", xlab ="Count", ylab="Probability")
# do vertical lines
segments(0:n, rep(0, n), 0:n, dpois(0:n, lambda))
# plot title
title(paste("Poisson distribution, ", "mean = ", lambda))
# predicted complaints
df <- data.frame(count=0:7, predicted=dpois(0:7, lambda)*41)
#df$prb <- df$predicted/sum(df$predicted)
Source: Le, C.T. (2010). Applied categorical data analysis and translational research. P. 203.
Column 1: Number of complaints made
*Column 2: Number of doctors**
library(lattice)
dta <- read.csv(file="http://titan.ccunix.ccu.edu.tw/~psycfs/dataM/Data/ERComplaints.txt",h=T,sep="")
with(dta, qqmath(numberOfDr,
prepanel = prepanel.qqmathline,
panel = function(x, ...) {
panel.qqmathline(x, y=x,
distribution= function(p) qpois(p, 3), col="blue")
panel.qqmath(x, ...) }))
# a little bit strange
solution 2
library(fitdistrplus)
## Loading required package: MASS
#hist(dta$numberOfDr)
fit <- fitdist(dta$numberOfDr, "pois")
plot(fit)
fit$aic;fit$bic;fit$estimate
## [1] 54.29873
## [1] 54.37818
## lambda
## 5.5
2. A group of students have taken exams in Math, Science, and English. These scores are to be combined into a single grade for each student. An ‘A’ is assigned to the top 20 percent of students, a ‘B’ to the next 20 percent, and so on. The final results are sorted alphabetically by the last names of the students. Use the following R script to complete the task described above.
pupil <- c("John Denver", "Amy Tan", "Bill Clinton", "Dave Kuhn",
"Jane Mayer", "Kevin Costner", "Ruben Yin", "Greg Knox", "Joel English",
"Meg Ryan")
Math <- c(502, 600, 412, 358, 495, 512, 410, 625, 573, 522)
Science <- c(95, 99, 80, 82, 75, 85, 80, 95, 89, 86)
English <- c(25, 22, 18, 15, 20, 28, 15, 30, 27, 18)
roster <- data.frame(pupil, Math, Science, English, stringsAsFactors=FALSE)
fullname <- strsplit((roster$pupil), " ")
#str(fullname)
firstname <- sapply(fullname, "[", 1)
lastname <- sapply(fullname, "[", 2)
roster <- data.frame(firstname, lastname, roster[ , -1])
roster1 <- roster
roster1[,3:5] <- scale(roster1[,3:5])
roster1$grade <- roster1$Math+roster1$Science+roster1$English
roster1$rank <- cut(roster1$grade,5,labels = toupper(letters[5:1]))
roster$rank <- roster1$rank
roster[order(roster$lastname),]
## firstname lastname Math Science English rank
## 3 Bill Clinton 412 80 18 E
## 6 Kevin Costner 512 85 28 B
## 1 John Denver 502 95 25 B
## 9 Joel English 573 89 27 B
## 8 Greg Knox 625 95 30 A
## 4 Dave Kuhn 358 82 15 E
## 5 Jane Mayer 495 75 20 D
## 10 Meg Ryan 522 86 18 D
## 2 Amy Tan 600 99 22 A
## 7 Ruben Yin 410 80 15 E
3.School administrators collected attendance data on 316 high school juniors from two urban high schools. The variables of interest are the number of days absent from school, gender of the student, and a standardized test score in mathematics.
dta <- read.csv(file="http://titan.ccunix.ccu.edu.tw/~psycfs/dataM/Data/attendance.txt",h=T,sep="")
#head(dta);str(dta)
Recode the school IDs to ‘S01’ and ‘S02’.
library(plyr);library(dplyr);library(ggplot2)
##
## Attaching package: 'dplyr'
##
## The following objects are masked from 'package:plyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
##
## The following object is masked from 'package:MASS':
##
## select
##
## The following objects are masked from 'package:stats':
##
## filter, lag
##
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
dta$school <- revalue(dta$school <- as.factor(dta$school), c("1"="S01", "2"="S02"))
Recode the gender ID to ‘Female’ and ‘Male’.
dta$male <- revalue(dta$male <- as.factor(dta$male), c("1" ="Male", "0" ="Female"))
dta <- rename(dta,gender=male)
Create index for students within each school.
# I could not catch the meaning of the question.
Find the mean math score of each school.
aggregate(dta$math, by=list(dta$school), FUN=mean, na.rm=TRUE)
## Group.1 x
## 1 S01 42.20373
## 2 S02 55.38172
Find the mean math score of each school by gender.
aggregate(dta$math, by=list(dta$school,dta$gender), FUN=mean, na.rm=TRUE)
## Group.1 Group.2 x
## 1 S01 Female 42.01706
## 2 S02 Female 55.97780
## 3 S01 Male 42.36218
## 4 S02 Male 54.60155
Find the mean number of days of absence of each school.
mean(dta$daysabs)
## [1] 5.810127
Find the mean number of days of absence of each school by gender.
aggregate(dta$daysabs, by=list(dta$gender), FUN=mean, na.rm=TRUE)
## Group.1 x
## 1 Female 6.697531
## 2 Male 4.876623
Is the number of days of absence related to either gender or math score or both? Does the same relationship hold for both schools?
ggplot(dta,aes(y=daysabs,x=math,col=gender))+
geom_point()+
geom_smooth(method = "lm")+
facet_grid(.~school)
summary(lm(daysabs~math+gender+school+gender*school,data=dta))
##
## Call:
## lm(formula = daysabs ~ math + gender + school + gender * school,
## data = dta)
##
## Residuals:
## Min 1Q Median 3Q Max
## -10.624 -3.568 -1.911 1.997 37.180
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 11.35579 1.28053 8.868 < 2e-16 ***
## math -0.02216 0.02355 -0.941 0.347477
## genderMale -4.23096 1.10505 -3.829 0.000156 ***
## schoolS02 -6.47483 1.14468 -5.656 3.5e-08 ***
## genderMale:schoolS02 3.78060 1.57273 2.404 0.016808 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.944 on 311 degrees of freedom
## Multiple R-squared: 0.1421, Adjusted R-squared: 0.1311
## F-statistic: 12.88 on 4 and 311 DF, p-value: 1.025e-09
summary(lm(daysabs~math+gender,data=subset(dta,dta$school=="S01")))
##
## Call:
## lm(formula = daysabs ~ math + gender, data = subset(dta, dta$school ==
## "S01"))
##
## Residuals:
## Min 1Q Median 3Q Max
## -10.759 -5.602 -2.375 2.629 36.695
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 11.98752 1.79502 6.678 4.02e-10 ***
## math -0.03720 0.03594 -1.035 0.30234
## genderMale -4.22577 1.31925 -3.203 0.00165 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8.289 on 156 degrees of freedom
## Multiple R-squared: 0.06807, Adjusted R-squared: 0.05612
## F-statistic: 5.697 on 2 and 156 DF, p-value: 0.004092
summary(lm(daysabs~math+gender,data=subset(dta,dta$school=="S02")))
##
## Call:
## lm(formula = daysabs ~ math + gender, data = subset(dta, dta$school ==
## "S02"))
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.707 -3.206 -1.657 0.794 37.372
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.553860 1.694079 2.098 0.0376 *
## math 0.001547 0.028584 0.054 0.9569
## genderMale -0.417732 0.846452 -0.494 0.6224
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.25 on 154 degrees of freedom
## Multiple R-squared: 0.001618, Adjusted R-squared: -0.01135
## F-statistic: 0.1248 on 2 and 154 DF, p-value: 0.8828
4.The following R script converts Taiwan dollars into US dollars. Modify it so that it converts US dollars to Euros.
US2EU <- function(data_US, exrate)
return(paste("$", format(round(data_US*exrate, 2)),sep=""))
exrate <- 0.889209
data_US <- round(rnorm(6, mean=3000, sd=300))
data.frame(USD=data_US, Euros=US2EU(data_US, exrate))
## USD Euros
## 1 2780 $2472.00
## 2 3564 $3169.14
## 3 3000 $2667.63
## 4 2416 $2148.33
## 5 2730 $2427.54
## 6 2731 $2428.43
8.Use the data in the high schools example and write your own functions to solve the following problems:
(a) test if any pairs of the five variables: read , write , math science , and socst , are different in means.
dta <- read.table("data/hs0.txt",h=T)
head(dta)
## id female race ses schtyp prog read write math science socst
## 1 70 male white low public general 57 52 41 47 57
## 2 121 female white middle public vocation 68 59 53 63 61
## 3 86 male white high public general 44 33 54 58 31
## 4 141 male white high public vocation 63 44 47 53 56
## 5 172 male white middle public academic 47 52 57 53 61
## 6 113 male white middle public academic 44 52 51 63 61
pairs_meandiff <- function(var1,var2){
ans <- t.test(var1,var2)$p.value
print(ans) # for proving
return(ifelse(ans>0.05,"Same","Different"))
}
pairs_meandiff(dta$read,dta$read)
## [1] 1
## [1] "Same"
(b) test if the 4 different ethnic groups have the same mean scores for each of the 5 variables (individually): read , write , math science , and socst .
factor_meandiff <- function(var1,var2){
x <- summary(lm(var2~var1))
ans <- pf(x$fstatistic[1], x$fstatistic[2], x$fstatistic[3],
lower.tail = FALSE)
results <- ifelse(as.numeric(ans)>0.05,"There are all same mean scores","There are some different scores")
#print(x)
#print(ans) # for checking
return(results)
}
factor_meandiff(dta$race,dta$read)
## [1] "There are some different scores"
factor_meandiff(dta$race,dta$write)
## [1] "There are some different scores"
factor_meandiff(dta$race,dta$math)
## [1] "There are some different scores"
factor_meandiff(dta$race,dta$science)
## [1] "There are some different scores"
factor_meandiff(dta$race,dta$socst)
## [1] "There are some different scores"