Final Exam

Required functions

# Shade T
shadet = function(below=NULL, above=NULL, pcts = c(0.025,0.975), df=1, numpts = 500, color = "gray", dens = 40,   justabove= FALSE, justbelow = FALSE, lines=FALSE,between=NULL,outside=NULL){

    if(is.null(between)){
         below = ifelse(is.null(below), qt(pcts[1],df), below)
         above = ifelse(is.null(above), qt(pcts[2],df), above)
    }
    if(is.null(outside)==FALSE){
         below = min(outside)
         above = max(outside)
    }
  
    lowlim = -4
    uplim  = 4
    x.grid = seq(lowlim,uplim, length= numpts)
    dens.all = dt(x.grid,df)
    
    if(lines==FALSE){
          plot(x.grid, dens.all, type="l", xlab="X", ylab="Density")
    }

    if(lines==TRUE){
          lines(x.grid,dens.all)
    }
    
    if(justabove==FALSE){
        x.below    = x.grid[x.grid<below]
        dens.below = dens.all[x.grid<below]
        polygon(c(x.below,rev(x.below)),c(rep(0,length(x.below)),rev(dens.below)),col=color,density=dens)
    }
    if(justbelow==FALSE){
        x.above    = x.grid[x.grid>above]
        dens.above = dens.all[x.grid>above]
        polygon(c(x.above,rev(x.above)),c(rep(0,length(x.above)),rev(dens.above)),col=color,density=dens)
    }
    
    if(is.null(between)==FALSE){
         from = min(between)
         to   = max(between)
         x.between    = x.grid[x.grid>from&x.grid<to]
         dens.between = dens.all[x.grid>from&x.grid<to]
         polygon(c(x.between,rev(x.between)),c(rep(0,length(x.between)),rev(dens.between)),col=color,density=dens)
    }
}

# MYP
myp=function(p, alpha){
  if(p<alpha){print('REJECT Ho')}else{print('FAIL 2 REJECT')}
}

Question 1

1a

# P(IQ>=110)
# mean =100
# sd   =16

pnorm(110, 
      mean = 100, 
      sd = 16, 
      lower.tail = FALSE)

## [1] 0.2659855

1b

# P(IQ>=110)
# mean =110
# sd   =16

pnorm(110, 
      mean = 110, 
      sd = 16, 
      lower.tail = FALSE)

## [1] 0.5

Question 2

1a

Step 1 Ho: >= 50% of American adults did not attend college due to financial reasons Ha: < 50% of American adults did not attend college due to financial reasons

Step 2 alpha = 0.05

phat  <-  0.48    # sample proportion
p     <-  0.5     # population hypothesized value
q     <-  1 - p

alpha <- 0.05
n     <- 331
df    <- n-1

Se <- sqrt(p * q / n)   

t  <- (phat - p) / Se  
teststat <- t
teststat

## [1] -0.7277362

criticalval <- qt(p = 0.05,df = n-1)
criticalval

## [1] -1.649484

shadet(df = n-1, 
       pcts = c(0.05),
       col = 'lightblue'
       )
?lines
lines(x = rep(x = teststat, 
              times = 10), 
      y = seq(from = 0, to = 20,length.out=10), 
      col='gray')

pval <- pnorm(t) 
pval

## [1] 0.2333875

alpha

## [1] 0.05

myp(p = pval, alpha = alpha)

## [1] "FAIL 2 REJECT"

P value > alpha, fail to reject null hypothesis - not enough evidence to support the newspaper’s claim that only a minority of American adults do not go to college due to financial reasons.

2b

SE = sqrt((0.48*(1-0.48))/331)
SE

## [1] 0.02746049

Standard error of 2.7% in a sample proportion of 48% means that there is 95% confidence interval. The 0.05 is contained within this confidence interval.

Question 3

Hypotheses Ho: Difference of two means (distracted and undistracted group) is zero. Ha: Difference of two means (distracted and undistracted group) is not zero.

Level of Significance alpha = 0.05

Distracted = Group 1 Undistracted = Group 2

# Ho: Mu1-Mu2=0, 
# Ha: Mu1-Mu2<>0

mu1 <- 4.9
mu2 <- 6.1

alpha = 0.05

# dist = t
n1    =   22
n2    =   22

df1   =   n1 - 1
df2   =   n2 - 1

sd1   = 1.8
sd2   = 1.8

var1  = sd1^2
var2  = sd2^2

pointestdiff = (mu1 - mu2) 
denSe = sqrt(var1/n1 + var2/n2)
t   = (pointestdiff - 0) / denSe      
teststat <- t
teststat

## [1] -2.211083

numdf = (var1/n1 + var2/n2)^2                       
dendf = (var1/n1)^2 / df1 + (var2/n2)^2 / df2       
df = numdf / dendf                                  

criticalvalsatter <- qt(p = (1-alpha/2), df = numdf / dendf)
criticalvalsatter

## [1] 2.018082

?min
criticalvalcons <- qt(p = (1-alpha/2), df = min(df1, df2))
criticalvalcons

## [1] 2.079614

shadet(df = df, 
       pcts = c(0.05,0.95),
       col = 'lightblue')
?seq
lines(x = rep(t,10), 
      y = seq(from = 0, 
              to = 1, 
              length.out = 10),
      col='gray'
        )

?pt
pval = 2*(1 - pt(q = t, 
                 df = numdf / dendf)
                )    
pval

## [1] 1.967472

myp(p = pval,alpha = alpha)

## [1] "FAIL 2 REJECT"

pvalconservative = 2*( 1 - pt(q = t, 
                              df = min(df1, df2)
))
pvalconservative

## [1] 1.961743

myp(p = pvalconservative,alpha = alpha)

## [1] "FAIL 2 REJECT"

CI <- c(pointestdiff - (criticalvalcons * denSe), 
        pointestdiff + (criticalvalcons * denSe))
CI

## [1] -2.3286489 -0.0713511

Difference of two means (distracted and undistracted groups) is significantly different from zero, Fail to reject null.

Question 4

obs         <- 25
samplemean  <- (65+77)/2
marginerror <-(77-65)/2
tdf         <- round(qt(c(0.05, 0.95), df=24)[2], 3)
se          <- round((77-samplemean)/tdf, 3)
sd          <- se * sqrt(25)

samplemean

## [1] 71

marginerror

## [1] 6

sd

## [1] 17.535

Question 5

mym <- matrix(c(4,30,24,45), nrow=2)
colnames(mym) <- c("control", "treatment")
rownames(mym) <- c("alive","dead")
mym

##       control treatment
## alive       4        24
## dead       30        45

Hypotheses Ho: Heart transplant affected survival rate of patients Ha: Heart transplant did not affect survival rate of patients

Normal distribution works well for large quantities of observations. However, the provided variables are discrete and few. We are also not provided with the raw observed data, but instead the summations of the data, which makes it inaccurate if we were to calculate a confidence interval. A confidence interval would not be valid in this case and may provide a false sense of security when drawing conclusions using this data.

Question 6

titanic <- read.csv("/Users/bellajean/Downloads/train.csv")
head(titanic)

##   PassengerId Survived Pclass
## 1           1        0      3
## 2           2        1      1
## 3           3        1      3
## 4           4        1      1
## 5           5        0      3
## 6           6        0      3
##                                                  Name    Sex Age SibSp Parch
## 1                             Braund, Mr. Owen Harris   male  22     1     0
## 2 Cumings, Mrs. John Bradley (Florence Briggs Thayer) female  38     1     0
## 3                              Heikkinen, Miss. Laina female  26     0     0
## 4        Futrelle, Mrs. Jacques Heath (Lily May Peel) female  35     1     0
## 5                            Allen, Mr. William Henry   male  35     0     0
## 6                                    Moran, Mr. James   male  NA     0     0
##             Ticket    Fare Cabin Embarked
## 1        A/5 21171  7.2500              S
## 2         PC 17599 71.2833   C85        C
## 3 STON/O2. 3101282  7.9250              S
## 4           113803 53.1000  C123        S
## 5           373450  8.0500              S
## 6           330877  8.4583              Q

tail(titanic)

##     PassengerId Survived Pclass                                     Name    Sex
## 886         886        0      3     Rice, Mrs. William (Margaret Norton) female
## 887         887        0      2                    Montvila, Rev. Juozas   male
## 888         888        1      1             Graham, Miss. Margaret Edith female
## 889         889        0      3 Johnston, Miss. Catherine Helen "Carrie" female
## 890         890        1      1                    Behr, Mr. Karl Howell   male
## 891         891        0      3                      Dooley, Mr. Patrick   male
##     Age SibSp Parch     Ticket   Fare Cabin Embarked
## 886  39     0     5     382652 29.125              Q
## 887  27     0     0     211536 13.000              S
## 888  19     0     0     112053 30.000   B42        S
## 889  NA     1     2 W./C. 6607 23.450              S
## 890  26     0     0     111369 30.000  C148        C
## 891  32     0     0     370376  7.750              Q

6.1

cor(titanic$Age, titanic$Survived)

## [1] NA

cor(titanic$Pclass, titanic$Survived)

## [1] -0.338481

6.2

library("psych")
describe(titanic)

##             vars   n   mean     sd median trimmed    mad  min    max  range
## PassengerId    1 891 446.00 257.35 446.00  446.00 330.62 1.00 891.00 890.00
## Survived       2 891   0.38   0.49   0.00    0.35   0.00 0.00   1.00   1.00
## Pclass         3 891   2.31   0.84   3.00    2.39   0.00 1.00   3.00   2.00
## Name*          4 891 446.00 257.35 446.00  446.00 330.62 1.00 891.00 890.00
## Sex*           5 891   1.65   0.48   2.00    1.68   0.00 1.00   2.00   1.00
## Age            6 714  29.70  14.53  28.00   29.27  13.34 0.42  80.00  79.58
## SibSp          7 891   0.52   1.10   0.00    0.27   0.00 0.00   8.00   8.00
## Parch          8 891   0.38   0.81   0.00    0.18   0.00 0.00   6.00   6.00
## Ticket*        9 891 339.52 200.83 338.00  339.65 268.35 1.00 681.00 680.00
## Fare          10 891  32.20  49.69  14.45   21.38  10.24 0.00 512.33 512.33
## Cabin*        11 891  18.63  38.14   1.00    8.29   0.00 1.00 148.00 147.00
## Embarked*     12 891   3.53   0.80   4.00    3.66   0.00 1.00   4.00   3.00
##              skew kurtosis   se
## PassengerId  0.00    -1.20 8.62
## Survived     0.48    -1.77 0.02
## Pclass      -0.63    -1.28 0.03
## Name*        0.00    -1.20 8.62
## Sex*        -0.62    -1.62 0.02
## Age          0.39     0.16 0.54
## SibSp        3.68    17.73 0.04
## Parch        2.74     9.69 0.03
## Ticket*      0.00    -1.28 6.73
## Fare         4.77    33.12 1.66
## Cabin*       2.09     3.07 1.28
## Embarked*   -1.27    -0.16 0.03

6.3

set.seed(100)

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

smalltitanic <- sample_n(titanic, 500)

relation <- lm(smalltitanic$Survived~smalltitanic$Age,
               data=smalltitanic)
print(summary(relation))

## 
## Call:
## lm(formula = smalltitanic$Survived ~ smalltitanic$Age, data = smalltitanic)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.4837 -0.4074 -0.3439  0.5894  0.7132 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       0.486850   0.054650   8.908   <2e-16 ***
## smalltitanic$Age -0.003176   0.001651  -1.924   0.0551 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4873 on 403 degrees of freedom
##   (95 observations deleted due to missingness)
## Multiple R-squared:  0.009101,   Adjusted R-squared:  0.006642 
## F-statistic: 3.701 on 1 and 403 DF,  p-value: 0.05507

6.4

plot(smalltitanic$Age,
     smalltitanic$Survived,
     data = smalltitanic,
     col  = "blue",
     main = "Relationship of Survival and Age",
     abline(relation),
     xlab = "Age",
     ylab = "Survived Y/N")

## Warning in plot.window(...): "data" is not a graphical parameter

## Warning in plot.xy(xy, type, ...): "data" is not a graphical parameter

## Warning in axis(side = side, at = at, labels = labels, ...): "data" is not a
## graphical parameter

## Warning in axis(side = side, at = at, labels = labels, ...): "data" is not a
## graphical parameter

## Warning in box(...): "data" is not a graphical parameter

## Warning in title(...): "data" is not a graphical parameter

6.5

prediction <- predict(relation, newdata=smalltitanic)
?ifelse
expected <- ifelse(prediction>0.5, yes=1, no=0)

6.6

observed <- smalltitanic$Survived
outcometable <- table(observed,expected)
outcometable

##         expected
## observed   0
##        0 246
##        1 159

6.7

correctlypredicted <- (outcometable[0,1] + outcometable[1,0]) / sum(outcometable)
correctlypredicted

## numeric(0)

6.8

Linearity: the parameters being estimated are linear because the variables are binary.

Random: the data is randomly sampled from the population.

Non-Collinearity: the variables being calculated are not perfectly correlated with each other, age and survival do not have an perfectly correlated relationship

Exogeneity: the variables are independent of each other.

Homoscedasticity: the errors in the data are constant.

The linear regression is not very accurate in prediction

What could be done to improve this data is to have a more thorough clean-up.