Exercise:Functions

1.The number of complaints received for each of 44 doctors working in the emergency room of a large hospital in a year was recorded. Does the distribution of number of complaints follow the Poisson distribution with mean 3? Use this R script to answer the question.

dat1 <- read.table("data/ERComplaints.txt",header=T)
# script for Poisson distribution
# upper bound of the count value
n <- 13
# mean parameter of a Poisson distribution
lambda <- 3
# upper bound of y-axis for plotting
ylim <- 0.25
# set up a plotting frame
plot(c(0,n), c(0, ylim), type ="n", xlab ="Count", ylab="Probability")
# do vertical lines
segments(0:n, rep(0, n), 0:n, dpois(0:n, lambda))
# plot title 
title(paste("Poisson distribution, ", "mean = ", lambda))
# predicted complaints
data.frame(count=0:7, predicted=dpois(0:7, lambda)*41)

  count predicted
1     0 2.0412698
2     1 6.1238094
3     2 9.1857141
4     3 9.1857141
5     4 6.8892856
6     5 4.1335714
7     6 2.0667857
8     7 0.8857653

library(lattice)

with(dat1, qqmath(numberOfDr,pch=15,col="darkgreen",
prepanel = prepanel.qqmathline,
panel = function(x, ...) {
panel.qqmathline(x, y=x,
distribution=function(p) qpois(p,3), col="red")
panel.qqmath(x, ...) }))

3.School administrators collected attendance data on 316 high school juniors from two urban high schools. The variables of interest are the number of days absent from school, gender of the student, and a standardized test score in mathematics.
1.Recode the school IDs to ‘S01’ and ‘S02’.

dta <- read.table("data/attendance.txt",header=T)
dta$school <- as.factor(dta$school)
levels(dta$school) <- c("S01","S02")
str(dta)

'data.frame':   316 obs. of  4 variables:
 $ school : Factor w/ 2 levels "S01","S02": 1 1 1 1 1 1 1 1 1 1 ...
 $ male   : int  1 1 0 0 0 0 0 1 1 1 ...
 $ math   : num  56.99 37.09 32.28 29.06 6.75 ...
 $ daysabs: int  4 4 2 3 3 13 11 7 10 9 ...

2.Recode the gender ID to ‘Female’ and ‘Male’.

dta$male <- as.factor(dta$male)
levels(dta$male) <- c("Female","Male")
str(dta)

'data.frame':   316 obs. of  4 variables:
 $ school : Factor w/ 2 levels "S01","S02": 1 1 1 1 1 1 1 1 1 1 ...
 $ male   : Factor w/ 2 levels "Female","Male": 2 2 1 1 1 1 1 2 2 2 ...
 $ math   : num  56.99 37.09 32.28 29.06 6.75 ...
 $ daysabs: int  4 4 2 3 3 13 11 7 10 9 ...

3.Create index for students within each school.

###

4.Find the mean math score of each school.

aggregate(dta$math, by=list(dta$school), FUN=mean)

  Group.1        x
1     S01 42.20373
2     S02 55.38172

5.Find the mean math score of each school by gender.

aggregate(dta$math, by=list(dta$male), FUN=mean)

  Group.1        x
1  Female 49.68685
2    Male 47.76658

6.Find the mean number of days of absence of each school.

aggregate(dta$daysabs, by=list(dta$school), FUN=mean)

  Group.1        x
1     S01 8.132075
2     S02 3.458599

7.Find the mean number of days of absence of each school by gender.

aggregate(dta$daysabs, by=list(dta$school,dta$male), FUN=mean)

  Group.1 Group.2         x
1     S01  Female 10.424658
2     S02  Female  3.640449
3     S01    Male  6.186047
4     S02    Male  3.220588

8.Is the number of days of absence related to either gender or math score or both? Does the same relationship hold for both schools?

abs1 <- lm(daysabs~male,data=dta)
summary(abs1)


Call:
lm(formula = daysabs ~ male, data = dta)

Residuals:
   Min     1Q Median     3Q    Max 
-6.698 -4.877 -2.698  2.123 38.302 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)   6.6975     0.5818  11.512   <2e-16 ***
maleMale     -1.8209     0.8334  -2.185   0.0296 *  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 7.405 on 314 degrees of freedom
Multiple R-squared:  0.01498,   Adjusted R-squared:  0.01184 
F-statistic: 4.774 on 1 and 314 DF,  p-value: 0.02963

abs2 <- lm(daysabs~math,data=dta)
summary(abs2)


Call:
lm(formula = daysabs ~ math, data = dta)

Residuals:
   Min     1Q Median     3Q    Max 
-9.046 -4.855 -2.560  1.623 42.596 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)  9.11472    1.20429   7.569 4.23e-13 ***
math        -0.06779    0.02320  -2.922  0.00373 ** 
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 7.361 on 314 degrees of freedom
Multiple R-squared:  0.02648,   Adjusted R-squared:  0.02338 
F-statistic: 8.539 on 1 and 314 DF,  p-value: 0.003728

abs3 <- lm(daysabs~male+math,data=dta)
summary(abs3)


Call:
lm(formula = daysabs ~ male + math, data = dta)

Residuals:
   Min     1Q Median     3Q    Max 
-8.184 -4.474 -2.372  2.200 41.790 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept) 10.21191    1.28158   7.968 3.02e-14 ***
maleMale    -1.95673    0.82362  -2.376  0.01811 *  
math        -0.07073    0.02306  -3.067  0.00235 ** 
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 7.308 on 313 degrees of freedom
Multiple R-squared:  0.04372,   Adjusted R-squared:  0.03761 
F-statistic: 7.155 on 2 and 313 DF,  p-value: 0.0009153

abs4 <- lm(daysabs~male+math,data=subset(dta,school=="S01"))
summary(abs4)


Call:
lm(formula = daysabs ~ male + math, data = subset(dta, school == 
    "S01"))

Residuals:
    Min      1Q  Median      3Q     Max 
-10.759  -5.602  -2.375   2.629  36.695 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept) 11.98752    1.79502   6.678 4.02e-10 ***
maleMale    -4.22577    1.31925  -3.203  0.00165 ** 
math        -0.03720    0.03594  -1.035  0.30234    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 8.289 on 156 degrees of freedom
Multiple R-squared:  0.06807,   Adjusted R-squared:  0.05612 
F-statistic: 5.697 on 2 and 156 DF,  p-value: 0.004092

abs5 <- lm(daysabs~male+math,data=subset(dta,school=="S02"))
summary(abs5)


Call:
lm(formula = daysabs ~ male + math, data = subset(dta, school == 
    "S02"))

Residuals:
   Min     1Q Median     3Q    Max 
-3.707 -3.206 -1.657  0.794 37.372 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)  
(Intercept)  3.553860   1.694079   2.098   0.0376 *
maleMale    -0.417732   0.846452  -0.494   0.6224  
math         0.001547   0.028584   0.054   0.9569  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 5.25 on 154 degrees of freedom
Multiple R-squared:  0.001618,  Adjusted R-squared:  -0.01135 
F-statistic: 0.1248 on 2 and 154 DF,  p-value: 0.8828

4.The following R script converts Taiwan dollars into US dollars. Modify it so that it converts US dollars to Euros.

# a function to convert US dollars to Euros dollars
transdollar <- function(x, xr)
  return(paste("$", format(round(x*100/xr, 0)/100, nsmall=2),
         sep=""))
# set the exchange rate
xr <- 1.13
# pick some random amount of money
x <- round(rnorm(10, mean=3000, sd=300))
# show conversion
data.frame(US=x, Euros=transdollar(x, xr))

     US    Euros
1  2865 $2535.40
2  2299 $2034.51
3  3400 $3008.85
4  2608 $2307.96
5  3145 $2783.19
6  3190 $2823.01
7  2593 $2294.69
8  3099 $2742.48
9  3107 $2749.56
10 2749 $2432.74

5.Thr formula P = L (r/(1-(1+r)^(-M)) describes the payment you have to make per month for M number of months if you take out a loan of L amount today at a monthly interest rate of r. Compute how much you will have to pay per month for 10, 15, 20, 25, or 30 years if you borrow NT5,000,000, 10,000,000, or 15,000,000 from a bank that charges you 2%, 5%, or 7% for the monthly interest rate.

P <- function(L,r,M){
     print(L*(r/(1-(1+r)^(-M))))
}
P(5000000,0.02,c(10,15,20,25,30))

[1] 556632.6 389127.4 305783.6 256102.2 223249.6

P(10000000,0.05,c(10,15,20,25,30))

[1] 1295045.7  963422.9  802425.9  709524.6  650514.4

P(15000000,0.07,c(10,15,20,25,30))

[1] 2135663 1646919 1415894 1287158 1208796

8.Use the data in the high schools example and write your own functions to solve the following problems
(a) test if any pairs of the five variables: read , write , math , science , and socst , are different in means.

hse <- read.table("data/hs0.txt", header=T)
Dif <- function(a,b){
    t.test(a,b)$p.val
}
Dif(hse$read,hse$write)

[1] 0.5812665

Dif(hse$read,hse$math)

[1] 0.6728327

Dif(hse$read,hse$science)

[1] 0.757241

Dif(hse$read,hse$socst)

[1] 0.8676815

Dif(hse$write,hse$math)

[1] 0.8903494

Dif(hse$write,hse$science)

[1] 0.3775863

Dif(hse$write,hse$socst)

[1] 0.7150322

Dif(hse$math,hse$science)

[1] 0.4515844

Dif(hse$math,hse$socst)

[1] 0.8118467

Dif(hse$science,hse$socst)

[1] 0.6377587

test if the 4 different ethnic groups have the same mean scores for each of the 5 variables (individually): read , write , math , science , and socst .

newhse <- na.omit(hse)
test <- function(c){
    aggregate(c, by=list(newhse$race), FUN=mean)
}

test(newhse$read)

       Group.1        x
1 african-amer 46.26316
2        asian 51.90909
3     hispanic 47.00000
4        white 53.73944

test(newhse$write)

       Group.1        x
1 african-amer 47.84211
2        asian 58.00000
3     hispanic 46.78261
4        white 53.87324

test(newhse$math)

       Group.1        x
1 african-amer 46.47368
2        asian 57.27273
3     hispanic 47.56522
4        white 53.81690

test(newhse$science)

       Group.1        x
1 african-amer 42.42105
2        asian 51.45455
3     hispanic 46.21739
4        white 54.14789

test(newhse$socst)

       Group.1        x
1 african-amer 49.36842
2        asian 51.00000
3     hispanic 48.04348
4        white 53.49296