https://mollyaredtana.wixsite.com/sta108

Instruction

Note: the final project will be posted online for your classmates to view.

The final project is team work done by teams of up to four students.

You may use functions in the midterm reports from classmates in other teams. There are no penalties for using others’ code as long as you acknowledge them properly. Specifically, the tutorial is eligible for being nominated as the outstanding project, regardless where the code comes from. However, you need to acknowledge them in the Acknowledgement section. In addition, you will vote in the final survey to acknowledge the original coders.

Failing to acknowledge the use of other’s code will be counted as plagiarism. The final project will not be graded, and this incidence might be reported to the Student Judicial Affairs.

The class will be asked to vote for top tutorials. In addition, the instructors will nominate outstanding tutorials based on the quality of the tutorial, and creative content — suggestions made by the instructors will not be counted as your creativity.

Description

Create a video tutorial to illustrate the use of your functions, using either the flu shot data or the Project STAR data as an example.

  1. The video should be 10 minutes long and in MP4 format. (See UCD resources here.)

  2. As a minimum requirement, you need to walk through the functions that, not necessarily in the following order,

# all functions needed for this proj
# From Gitbook

fit.linear.model<-function(covariate,outcome){
  # I will write down the function for (multiple) linear regression here
  X=cbind(1,covariate);
  beta.fit=solve( t(X)%*%X )%*%t(X)%*%outcome;
  return(beta.fit)
}


sum.of.squares<-function(beta,covariate,outcome){
  # it intakes 3 parameters
  # beta, covariate -> Advertising_Data$radio, outcome -> Advertising$sales
  yout=linear.model(beta=beta,covariate=covariate);# build a simple linear model here
  res=outcome-yout; # and use the outome to minus the one we just got
  sos= sum(res^2); # square it
  return(sos)
}

estimate.sigma.sq<-function(beta,covariate,outcome){
   residual.sum.of.squares=sum.of.squares(beta=beta,covariate=covariate,outcome=outcome)
   n=length(outcome)
   sigma.sq.hat=residual.sum.of.squares/(n-2) 
   return(sigma.sq.hat)
}

estimate.coef.var<-function(beta,covariate,outcome){
  sigma.sq.hat=estimate.sigma.sq(beta,covariate,outcome)
  var.hat.beta=beta;
  var.hat.beta[2]=sigma.sq.hat/sum(  (covariate-mean(covariate))^2  ) 
  n=length(outcome)
  var.hat.beta[1]=sigma.sq.hat*sum(covariate^2)/sum((covariate-mean(covariate))^2  )/n
  return( var.hat.beta)
}

estimate.coef.sd<-function(beta,covariate,outcome){
  # In this report we set covariate to be Advertising$radio and outcome to be
  # Advertising$sales
  var.hat.beta=estimate.coef.var(beta,covariate,outcome)
 sd.hat.beta=sqrt(var.hat.beta);
  return(sd.hat.beta)
}


linear.model<-function(beta,covariate){
  yout=covariate*beta[2]+beta[1]
  return(yout);
}


conf.int.quantile<-function(alpha,type,...){
  if(type=="t"){
    out=qt(c(1-alpha/2,alpha/2), ... ) 
  }else if (type=="normal"){
    out=qnorm(c(1-alpha/2,alpha/2), ... ) 
  }
  return(out)
}

conf.int<-function(alpha,type,covariate,outcome,B=1e5){
  #  we use le5 which means the number it means 10 to the 5th power
  beta.hat=fit.linear.model(covariate,outcome);
  # then we fit the linear regression model
  beta.sd=estimate.coef.sd(beta=beta.hat,covariate,outcome);
  # and we call estimate.coef.ed() to get the beta.standard error
  if(type=='bootstrap'){ # check which type is it
    beta.hat.boot=replicate(B,boot.fit(covariate,outcome));
    out=t(apply(beta.hat.boot[1,,],1,quantile,probs=c(alpha/2,1-alpha/2)));
  }else if(type=='t'){# check which type is it
    quants<-conf.int.quantile(alpha,type='t',df=n-2)
    out=beta.hat%*%c(1,1)-beta.sd%*%quants;
  }else{# check which type is it
    quants<-conf.int.quantile(alpha,type='normal')
    out=beta.hat%*%c(1,1)-beta.sd%*%quants;
  }
  
  colnames(out)=c( paste(round(alpha*50,digits=3),'%'), paste(100-round(alpha*50,digits=3),'%')  )
  return(out)
}

calculate.pvalue<-function(covariate,outcome,type){
  beta.hat.t=calculate.t(covariate,outcome);
  if (type=='t'){
    n=length(outcome);
    pval=2*apply(cbind(pt(beta.hat.t,df=n-2),(1-pt(beta.hat.t,df=n-2))),1,min);

  }else if (type=='z'){
    pval=2*apply(cbind(pnorm(-abs(beta.hat.t)),(1-pnorm(abs(beta.hat.t)))),1,min);
    
  }else if (type == 'bootstrap'){
    
    beta.hat=fit.linear.model(covariate=covariate,outcome=outcome)
    beta.hat.boot=replicate(1e5,boot.fit(covariate=covariate,outcome=outcome));
    pval=numeric(length(beta.hat));
    for(i in 1:length(beta.hat)){
      boot.est=beta.hat.boot[1,i,]
      pval[i]=2*min( mean(0<boot.est), mean(0>boot.est) )
    }
  }
  return(pval)
}


calculate.t<-function(covariate,outcome){
  beta.hat=fit.linear.model(covariate=covariate,outcome=outcome)
  beta.hat.sd=estimate.coef.sd(beta=beta.hat,covariate=covariate,outcome=outcome)
  beta.hat.t = (beta.hat-0)/beta.hat.sd;
  return(beta.hat.t)
}


simulate.one.instance<-function(x,beta.null,alpha,type){
  n=length(x);
  Ey= x*beta.null[2]+beta.null[1];
  error.terms= rnorm(n)*2;
  y=Ey+error.terms;
  pval=calculate.pvalue(covariate=x,outcome=y,type=type);
  rej.flag= pval[2]<alpha
  return(rej.flag)
}

Read flu-shot data in

# input package tidyverse
library(tidyverse)
## ── Attaching packages ───────────────────────────────────────────────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.0     ✓ purrr   0.3.3
## ✓ tibble  3.0.0     ✓ dplyr   0.8.5
## ✓ tidyr   1.0.2     ✓ stringr 1.4.0
## ✓ readr   1.3.1     ✓ forcats 0.5.0
## ── Conflicts ──────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
# read data in
FLU <- read.table("flu240.txt", header = TRUE)
(a) fit a linear regression
# Gitbook

# First way : R built in function

fit.flu1 = lm(outcome~treatment.assigned+1,data=FLU)

# Then summry it

summary(fit.flu1)
## 
## Call:
## lm(formula = outcome ~ treatment.assigned + 1, data = FLU)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.09168 -0.09168 -0.07817 -0.07817  0.92183 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         0.091684   0.007425  12.348   <2e-16 ***
## treatment.assigned -0.013517   0.010364  -1.304    0.192    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2785 on 2889 degrees of freedom
## Multiple R-squared:  0.0005885,  Adjusted R-squared:  0.0002426 
## F-statistic: 1.701 on 1 and 2889 DF,  p-value: 0.1922
# Second way : Gitbook Functions

# fit a simple linear regression model
beta.hat = fit.linear.model(covariate = FLU$treatment.assigned, outcome = FLU$outcome)
# display it
beta.hat
##                  [,1]
##            0.09168443
## covariate -0.01351732
# plot it
ggplot(FLU, aes(x=treatment.assigned, y=outcome)) + 
  geom_jitter(width = 0.25)+
  geom_smooth(method = "lm") + theme_minimal(base_size = 18) + labs(title = "The Fiting Regression Model of The Data FLU")
## `geom_smooth()` using formula 'y ~ x'

  1. construct confidence intervals for regression coefficients
# Let's assume that we want to construct an 98.5% confidence interval

# R built-in functions

alpha = 0.015

CI1 = confint(fit.flu1,level=1-alpha)


# Gitbook functions
n = 2891
x = FLU$treatment.assigned
y = FLU$outcome
alpha = 0.015
CI2 = conf.int(alpha=alpha,type='t',covariate=x,outcome=y)


# display both

CI1
##                         0.75 %    99.25 %
## (Intercept)         0.07361284 0.10975603
## treatment.assigned -0.03874072 0.01170608
CI2
##                0.75 %    99.25 %
##            0.07361284 0.10975603
## covariate -0.03874072 0.01170608
  1. conduct z-test for hypothesis that one coefficient is zero
# Calculate t
t = calculate.t(x, y)
# Calculate p
p = calculate.pvalue(x, y, "z")

#display t, p

t
##                [,1]
##           12.347822
## covariate -1.304302
p
##           covariate 
## 0.0000000 0.1921306
p[2]
## covariate 
## 0.1921306
  1. perform multiple testing correction using Bonferroni correction
# From gitbook
m = 1000 # we want to test 1000 hypothesis


# Conduct the same test 
pvalues=numeric(m)

pvalues=numeric(m)
for (i in 1:m){
  pval=calculate.pvalue(covariate=FLU$treatment.assigned,outcome=FLU$outcome,type='z');
  pvalues[i]<-pval[2];
}

alpha.Bonf=alpha/m; # divided by the number of hypotheses
rej.flag=pvalues<alpha.Bonf;

## True positive
sum(rej.flag & ((1:m)<101))
## [1] 0
## False positive 
sum(rej.flag & ((1:m)>100))
## [1] 0
## True negative
sum(!rej.flag & ((1:m)>100))
## [1] 900
## False negative
sum(!rej.flag & ((1:m)<101))
## [1] 100
  1. and visualize the raw data as well as the fitted results.
library(tidyverse)
#install.packages("FSA")
library(FSA)
## ## FSA v0.8.30. See citation('FSA') if used in publication.
## ## Run fishR() for related website and fishR('IFAR') for related book.
headtail(FLU)
##      treatment.assigned treatment.received outcome age copd heart.disease
## 1                     1                  1       0  73    0             1
## 2                     0                  1       0  65    0             0
## 3                     0                  1       0  77    1             1
## 2889                  0                  0       0  69    0             1
## 2890                  1                  0       0  76    0             1
## 2891                  0                  0       0  70    0             0
##      female
## 1         1
## 2         1
## 3         1
## 2889      1
## 2890      1
## 2891      1
FLU$Bonferroni = p.adjust(FLU$treatment.assigned, method = "bonferroni")


# Raw Data
ggplot(FLU, aes(x=treatment.assigned, y=outcome)) + geom_point()

ggplot(FLU, aes(x=treatment.assigned, y=outcome)) + geom_jitter(width = 0.25)

# Fitted
beta.hat
##                  [,1]
##            0.09168443
## covariate -0.01351732
plot(beta.hat)

  1. You also need to explain your findings in the data analysis. In particular, we will look for one of the following.
  1. A conclusion on the association between flu-related hospital visits and whether the patient received the flu shot, while adjusting for the patient’s age and gender;
# 正确答案
library(tidyverse)
dat.flu = read.table("flu240.txt", header = TRUE)
# Fit a multiple linear regression
fit.flu= lm(outcome~treatment.received+age+female+1,data=dat.flu); 
summary(fit.flu)
## 
## Call:
## lm(formula = outcome ~ treatment.received + age + female + 1, 
##     data = dat.flu)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.13399 -0.09470 -0.07821 -0.07204  0.94318 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         0.1433464  0.0279772   5.124 3.19e-07 ***
## treatment.received  0.0001502  0.0120033   0.013   0.9900    
## age                -0.0006684  0.0004088  -1.635   0.1022    
## female             -0.0225113  0.0110036  -2.046   0.0409 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2784 on 2887 degrees of freedom
## Multiple R-squared:  0.002419,   Adjusted R-squared:  0.001383 
## F-statistic: 2.334 on 3 and 2887 DF,  p-value: 0.07209
# Summarize the fitted results
summary(fit.flu) 
## 
## Call:
## lm(formula = outcome ~ treatment.received + age + female + 1, 
##     data = dat.flu)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.13399 -0.09470 -0.07821 -0.07204  0.94318 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         0.1433464  0.0279772   5.124 3.19e-07 ***
## treatment.received  0.0001502  0.0120033   0.013   0.9900    
## age                -0.0006684  0.0004088  -1.635   0.1022    
## female             -0.0225113  0.0110036  -2.046   0.0409 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2784 on 2887 degrees of freedom
## Multiple R-squared:  0.002419,   Adjusted R-squared:  0.001383 
## F-statistic: 2.334 on 3 and 2887 DF,  p-value: 0.07209
plot(fit.flu)

# 我写的

M1 <- cbind(FLU$treatment.received, FLU$age, FLU$female)
M2 <- FLU$outcome

#beta_hat2 = fit.linear.model(cbind(FLU$treatment.received, FLU$age,x3_dummy,x4_dummy), y)

fit = fit.linear.model(M1, M2)
fit
##               [,1]
## [1,]  0.1433463876
## [2,]  0.0001501796
## [3,] -0.0006683895
## [4,] -0.0225112650
#summary(fit)
#coefficients(fit) # model coefficients
#confint(fit, level=0.95) # CIs for model parameters
#fitted(fit) # predicted values
#residuals(fit) # residuals
#anova(fit) # anova table
#vcov(fit) # covariance matrix for model parameters
#influence(fit) # regression diagnostics

#beta.hat = fit.linear.model(dat.advertising.radio, dat.advertising.sales);
fitted.received = fit[1]+fit[2]*M1;
unknown.var.estimator = sum((M2 - fitted.received)^2) / (length(M1) - 2);
error.sum.square = sum((M1-mean(M2))^2)
intercept.se.estimator = sqrt(unknown.var.estimator*(1/length(M1)+mean(M1)^2/error.sum.square)) # Standard Error for Intercept
slope.se.estimator = sqrt(unknown.var.estimator/error.sum.square) # Standard Error for Covariate
beta.hat.se = c(intercept.se.estimator, slope.se.estimator)
beta.hat.t = (fit-0) / beta.hat.se
beta.hat.p = 2 * pt(-beta.hat.t, df = length(M1)-2)
cbind(fit, beta.hat.se, beta.hat.t, beta.hat.p)
##                     beta.hat.se                        
## [1,]  0.1433463876 3.535522e-03   40.5446164 0.00000000
## [2,]  0.0001501796 7.995943e-05    1.8781973 0.06038771
## [3,] -0.0006683895 3.535522e-03   -0.1890497 1.14994174
## [4,] -0.0225112650 7.995943e-05 -281.5335695 2.00000000
#plot(fit) + abline(0, 1, col=1,lty=2,lwd=1)

#p1 = calculate.pvalue(M1, M2, "z")
#p1
#p1[2]




x2 = FLU$female
x3=FLU$age
x4=FLU$treatment.received

x2_dummy = model.matrix(~x2)[,-1]
x3_dummy = model.matrix(~x3)[,-1]
x4_dummy = model.matrix(~x4)[,-1]
beta_hat2 = fit.linear.model(cbind(x2_dummy,x3_dummy,x4_dummy), FLU$outcome)
beta_hat2[1:5]
## [1]  0.1433463876 -0.0225112650 -0.0006683895  0.0001501796            NA
summary(beta_hat2)
##        V1            
##  Min.   :-0.0225113  
##  1st Qu.:-0.0061291  
##  Median :-0.0002591  
##  Mean   : 0.0300792  
##  3rd Qu.: 0.0359492  
##  Max.   : 0.1433464
an=0.1433463876-0.0225112650*x2-0.0006683895*x3+0.0001501796*x4


la=y-an
summary(la)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## -0.13399 -0.09470 -0.07821  0.00000 -0.07204  0.94318
calculate.pvalue(cbind(x2_dummy,x3_dummy,x4_dummy), FLU$outcome, "z")
## Warning in sqrt(var.hat.beta): NaNs produced
##                  x2_dummy     x3_dummy     x4_dummy 
## 0.0000323767 0.0000000000          NaN 0.9902223463
calculate.pvalue(cbind(x2_dummy,x3_dummy,x4_dummy), FLU$outcome, "t")
## Warning in sqrt(var.hat.beta): NaNs produced
##                    x2_dummy      x3_dummy      x4_dummy 
##  3.331729e-05 4.849652e-317           NaN  9.902232e-01
# cited others' midterm report

M = cbind(x2_dummy,x3_dummy,x4_dummy)

fitted.received = beta_hat2[1]+beta_hat2[2]*M;
unknown.var.estimator = sum((FLU$outcome - M)^2) / (length(M) - 2);
error.sum.square = sum((M-mean(FLU$outcome))^2)
intercept.se.estimator = sqrt(unknown.var.estimator*(1/length(M)+mean(M)^2/error.sum.square)); # Standard Error for Intercept
slope.se.estimator = sqrt(unknown.var.estimator/error.sum.square); # Standard Error for Covariate
beta.hat.se = c(intercept.se.estimator, slope.se.estimator);
beta.hat.t = (beta_hat2-0) / beta.hat.se
beta.hat.p = 2 * pt(-beta.hat.t, df = length(M)-2)
cbind(beta_hat2, beta.hat.se, beta.hat.t, beta.hat.p)
##                        beta.hat.se                      
##           0.1433463876   0.4748671  0.30186633 0.7627612
## x2_dummy -0.0225112650   0.0107396 -2.09609840 1.9638976
## x3_dummy -0.0006683895   0.4748671 -0.00140753 1.0011230
## x4_dummy  0.0001501796   0.0107396  0.01398372 0.9888433
# By creating a new dataframe, we can see the relationship between treatment.assigned, treatment.received, age, and gender more clearly.

# For the hospital visit, I assume that treatment.assigned is the visit amount since the treatment 
A_G_A_R <- FLU %>% select(treatment.assigned, treatment.received, age, outcome) %>% arrange(age) %>%
    mutate(Gender = ifelse(FLU$female == 0, 0, 1)) %>%
    mutate(Gender = factor(Gender, labels = c("Male", "Female")))
# I use select() to only include the variables I want. And I use arrange to sort the age
# For the gender, we can see that there is only one column called female in the data frame so I used the mutate functions to create a new column called Gender to define it to be Female or Male which will be easier for us to observe.


# display it
#A_G_A_R
# Age_Gender_treatment.assigned_treatment.received

Based on my understanding to this data and the new dataframe I created, my guess is most of the people will not get the flu shot even if the physicians got a reminder of the flu shot based on the rate of the treatment.assigned and treatment.received.

Let us check my guess.

Total = 2891 # there are 2891 patients in total

TA = 0 # treatment_assigned
TR = 0 # treatment_received

i = 0 # count


for(i in A_G_A_R$treatment.assigned)
{
    if(i == 1)
    {
        TA = TA + 1;
    }
    else
    {
        TR = TR + 1;
    }
    i = i + 1;
}

# displat TA and TR
print("The amount of people who had been assigned of treatment is")
## [1] "The amount of people who had been assigned of treatment is"
TA # treatment_assigned
## [1] 1484
print("The amount of people who had received the treatment is")
## [1] "The amount of people who had received the treatment is"
TR # treatment_received
## [1] 1407
# Find the rate of it

Rate_TA = (TA /Total) * 100
Rate_TR = (TR /Total) * 100

# displat TA and TR
print("The percentage of people who had been assigned of treatment is")
## [1] "The percentage of people who had been assigned of treatment is"
Rate_TA
## [1] 51.33172
print("The percentage of people who had received the treatment is")
## [1] "The percentage of people who had received the treatment is"
Rate_TR
## [1] 48.66828
print(" All percentages data are in % ")
## [1] " All percentages data are in % "

Based on the data I got, we can found out that the rate of treatment.assigned is actually slightly higher than the treatment.received. In other words, some people their physicians got the reminder to provide the flu shot for them but some of them didn’t go to the hospitals to take it.

Next, I am going to split the patients into 2 groups, patients who actually received the flu shot whatever they got the treatment.assigned or not and the other group should contains the patients who didn’t receive the flu shot. Since we only care about the relationship between the actual outcome and the function of the flu shot.

P_R = TR # People who actually received the flu shot 
# Since I have already calculated the treatment.received above so I can just C & P here.

# Now, we can calculate the outcome amount and rate

got_flu_with_treatment = 0 # People who got the flu and they received the flu shot
got_flu_without_treatment = 0 # People who got the flu and they didn't receive the flu shot
N_got_flu_with_treatment = 0 # People who didn't end up with the flu with flu shot
N_got_flu_without_treatment = 0 # People who didn't end up with the flu and without flu shot

# In order to make sure the data is limited in the people who received the flu shot
# we need to make treatment.received column to be a vector

# treatment_received
tr = vector()
count = 1
for(each_tr in A_G_A_R$treatment.received)
{
    tr[count] = each_tr;
    count = count + 1;
}

count = 1 # keep track of the tr vector
# it is not like c++ or other languages, the index in r actually starts from 1


for(i in A_G_A_R$outcome)
{
    if(i == 1) # ended up with the flu
    {
        if(tr[count] == 1) # received flu shot
        {
            got_flu_with_treatment = got_flu_with_treatment + 1;
        }
        else # didn't receive flu shot
        {
            got_flu_without_treatment = got_flu_without_treatment + 1;
        }
    }
    else # didn't end up with flu
    {
        if(tr[count] == 1) # received flu shot
        {
            N_got_flu_with_treatment = N_got_flu_with_treatment + 1;
        }
        else # didn't receive flu shot
        {
            N_got_flu_without_treatment = N_got_flu_without_treatment + 1;
        }
        
    }
    
    count = count + 1;
}


print("There are 2891 patients in total")
## [1] "There are 2891 patients in total"
print("The number of patients who actually received the flu shot is")
## [1] "The number of patients who actually received the flu shot is"
P_R# People who actually received the flu shot 
## [1] 1407
print("The number of patients who got the flu and they received the flu shot is")
## [1] "The number of patients who got the flu and they received the flu shot is"
got_flu_with_treatment # People who got the flu and they received the flu shot
## [1] 61
print("The number of patients who got the flu and they didn't receive the flu shot is")
## [1] "The number of patients who got the flu and they didn't receive the flu shot is"
got_flu_without_treatment # People who got the flu and they didn't receive the flu shot
## [1] 184
print("The number of patients who didn't end up with the flu with flu shot is")
## [1] "The number of patients who didn't end up with the flu with flu shot is"
N_got_flu_with_treatment # People who didn't end up with the flu with flu shot
## [1] 661
print("The number of patients who didn't end up with the flu and without flu shot is")
## [1] "The number of patients who didn't end up with the flu and without flu shot is"
N_got_flu_without_treatment # People who didn't end up with the flu and without flu shot
## [1] 1985
print("The percentage of the patients who received the flu shot but still got the flu is")
## [1] "The percentage of the patients who received the flu shot but still got the flu is"
rate_R_Flu = (got_flu_with_treatment / P_R) * 100
rate_R_Flu
## [1] 4.335466
print("The percentage of the patients who got the flu and they didn't receive the flu shot is")
## [1] "The percentage of the patients who got the flu and they didn't receive the flu shot is"
rate_N_R_Flu = (got_flu_without_treatment / P_R) * 100
rate_N_R_Flu
## [1] 13.07747
print("!!! All percentages data are in % !!!")
## [1] "!!! All percentages data are in % !!!"

Based on that, we can actually find out the relationship between the flu shot and the outcome really easy. There are 2891 patients in total and 1407 which is nearly half of them received the flu shot in hospital. In this amount of 1407 of people, there are only 61 people who actually got the flu even though they took the flu shot and it only occupies 4.34% in 1407 people. Though the rate is high in some ways, it is kind of fine. On the other side, we can see that there are 184 people who ended up with the flu without getting the flu shots. 184 is the triple of 61 adn the percentage of these people are 13.08% which is really high. Comparing to that, we can see there are 661 patients who didn’t get the flu because they receive the flu shot. However, there are more people have the ability to cure themselves even without the flu shot which occupies the main part of the total amount of patients. Afterall, flu is not a big deal.

The next step is analyze how the age affect the flu.

Get the mean age first.

# the Mean of the age

Mean_Age = 0 # set it to be 0 at first
a = 0 # count
sum = 0 # sum of age
Median_age  = median(A_G_A_R$age)
for(a in A_G_A_R$age)
{
    sum = sum + a;
}

# display sum
sum
## [1] 188593
# find the mean_age
Mean_Age = sum / Total

print("mean age is")
## [1] "mean age is"
Mean_Age
## [1] 65.23452
print("Median Age is")
## [1] "Median Age is"
Median_age
## [1] 67
# Since the median age and the mean age are almost same so we can take the middle number between the mean age and the median age

Split_Age = 66
# set it to be 66

People beyond or equal to 66 are old

People below 66 are young

# make it to be the vector, easier to keep track of it
# age
AGE = vector()
count = 1 # reuse it
for(each_a in A_G_A_R$age)
{
    AGE[count] = each_a;
    count = count + 1;
}




OLD = 0
YOUNG = 0;
count = 1;

for(i in A_G_A_R$age)
{
    if(i >= Split_Age & tr[count] == 1)
    {
        OLD = OLD + 1;
    }
    else if(i < Split_Age & tr[count] == 1)
    {
        YOUNG = YOUNG + 1;
    }
    count = count + 1;
}

print(" The total amount of the old is ")
## [1] " The total amount of the old is "
OLD
## [1] 447
print(" The total amount of the young is ")
## [1] " The total amount of the young is "
YOUNG
## [1] 275
count = 1 # keep track of AGE and tr
Old_R_Flu = 0 # The old received the flu shot but still ended up with the flu
Young_R_Flu = 0 # The young received the flu shot but still ended up with the flu

for( i in A_G_A_R$outcome)
{
    if(i == 1 & tr[count] == 1)# got flu but received flu
    {
        if(AGE[count] >= Split_Age)# old
        {
            Old_R_Flu = Old_R_Flu + 1;
        }
        else # young
        {
            Young_R_Flu = Young_R_Flu + 1;
        }
    }
    
    count = count + 1;
}

print("The old received the flu shot but still ended up with the flu")
## [1] "The old received the flu shot but still ended up with the flu"
Old_R_Flu # The old received the flu shot but still ended up with the flu
## [1] 32
print("he young received the flu shot but still ended up with the flu")
## [1] "he young received the flu shot but still ended up with the flu"
Young_R_Flu # The young received the flu shot but still ended up with the flu
## [1] 29

In the amount of 447 of the old who received, there are only 32 people got the flu even they took the flu shot. And there are only 29 people in 275 of young who received the flu even they took the flu shot. From that, we can also see a fact that the immunity of young is way better than the old and that’s why I recommend the old should get more vaccines to protect themselves.

However, these data wouldn’t be able to tell a big difference just based on the data above.

Last step the gender.

G = vector()
count = 1 # count
FEMALE = 0 # Total amout of Female who received the flu shot
MALE = 0 # Total amount of Male who received the flu shot
for(each_g in A_G_A_R$Gender)
{
    G[count] = each_g;
    
    if(each_g == "Female" & tr[count] == 1)
    {
        FEMALE = FEMALE + 1;
    }
    else if(each_g == "Male" & tr[count] == 1)
    {
        MALE = MALE + 1;
    }
    count = count + 1;
}



print(" The total amount of the Female who received the flu shot is")
## [1] " The total amount of the Female who received the flu shot is"
FEMALE
## [1] 475
print(" The total amount of the Male who received the flu shot is")
## [1] " The total amount of the Male who received the flu shot is"
MALE
## [1] 247
Female_R_Flu = 0 # Female got flu but received flu
Male_R_Flu = 0 # Male got flu but received flu


count = 1 

for( i in A_G_A_R$outcome)
{
    if(i == 1 & G[count] == "Female" & tr[count] == 1) # Female got flu but received flu
    {
        Female_R_Flu = Female_R_Flu + 1;
    }
    else if(i == 1 & G[count] == "Male" & tr[count] == 1) # Male got flu but received flu shot
    {
        Male_R_Flu = Male_R_Flu + 1;
    }
    
    count = count + 1;
}


print("The amount of Female who got flu but received flu shot")
## [1] "The amount of Female who got flu but received flu shot"
Female_R_Flu
## [1] 44
print("The amount of Male got flu but received flu shot")
## [1] "The amount of Male got flu but received flu shot"
Male_R_Flu
## [1] 17

Although there are more Female than Male who got flu and received flu shot, the total amount of Female is the double of the Male.

  1. A conclusion on the association between the class type (small, regular, regular with aide) and the students’ math grades in the first grade, while adjusting for student’s ethnicity, and school type.
  1. No specific styles of the tutorials are required. You are encouraged to be creative, but you will not be penalized so long as you fulfill the requirements in (1) – (3). For instance, you may just open Rstudio and walk through the code while explaining the usage of the functions.

Acknowladge:

Functions from Gitbook and other people’s Midterm Report.