# Shade T
# Helper that draws a t density and shades tail or interval regions.
shadet = function(below = NULL, above = NULL, pcts = c(0.025, 0.975), df = 1,
                  numpts = 500, color = "gray", dens = 40,
                  justabove = FALSE, justbelow = FALSE, lines = FALSE,
                  between = NULL, outside = NULL) {
  # Default cutoffs come from the requested percentiles of the t distribution
  if (is.null(between)) {
    below = ifelse(is.null(below), qt(pcts[1], df), below)
    above = ifelse(is.null(above), qt(pcts[2], df), above)
  }
  if (is.null(outside) == FALSE) {
    below = min(outside)
    above = max(outside)
  }
  lowlim = -4
  uplim = 4
  x.grid = seq(lowlim, uplim, length = numpts)
  dens.all = dt(x.grid, df)
  # Either start a new plot of the density or add it to an existing plot
  if (lines == FALSE) {
    plot(x.grid, dens.all, type = "l", xlab = "X", ylab = "Density")
  }
  if (lines == TRUE) {
    lines(x.grid, dens.all)
  }
  # Shade the lower tail (below the lower cutoff)
  if (justabove == FALSE) {
    x.below = x.grid[x.grid < below]
    dens.below = dens.all[x.grid < below]
    polygon(c(x.below, rev(x.below)),
            c(rep(0, length(x.below)), rev(dens.below)),
            col = color, density = dens)
  }
  # Shade the upper tail (above the upper cutoff)
  if (justbelow == FALSE) {
    x.above = x.grid[x.grid > above]
    dens.above = dens.all[x.grid > above]
    polygon(c(x.above, rev(x.above)),
            c(rep(0, length(x.above)), rev(dens.above)),
            col = color, density = dens)
  }
  # Shade the region between two cutoffs
  if (is.null(between) == FALSE) {
    from = min(between)
    to = max(between)
    x.between = x.grid[x.grid > from & x.grid < to]
    dens.between = dens.all[x.grid > from & x.grid < to]
    polygon(c(x.between, rev(x.between)),
            c(rep(0, length(x.between)), rev(dens.between)),
            col = color, density = dens)
  }
}
# MYP
# Helper that prints the test decision given a p-value and significance level
myp = function(p, alpha) {
  if (p < alpha) {
    print('REJECT Ho')
  } else {
    print('FAIL TO REJECT Ho')
  }
}
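Both helpers are used throughout this document. A quick illustration with arbitrary values:
# Example: shade both 2.5% tails of a t density with 10 df, then apply the decision rule
shadet(df = 10)
myp(p = 0.012, alpha = 0.05)   # 0.012 < 0.05, so this prints "REJECT Ho"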
# P(IQ>=110)
# mean =100
# sd =16
pnorm(110,
mean = 100,
sd = 16,
lower.tail = FALSE)
## [1] 0.2659855
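The same probability can be obtained by standardizing first; the z-score form below is equivalent to the call above:
# Equivalent calculation via the z-score
z <- (110 - 100) / 16
pnorm(z, lower.tail = FALSE)
## [1] 0.2659855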
# P(IQ>=110)
# mean =110
# sd =16
pnorm(110,
mean = 110,
sd = 16,
lower.tail = FALSE)
## [1] 0.5
Step 1: Ho: p >= 0.50 (at least half of American adults who did not attend college cite financial reasons). Ha: p < 0.50 (only a minority cite financial reasons).
Step 2: alpha = 0.05
phat <- 0.48 # sample proportion
p <- 0.5 # population hypothesized value
q <- 1 - p
alpha <- 0.05
n <- 331
df <- n-1
Se <- sqrt(p * q / n)   # standard error under the null hypothesis
t <- (phat - p) / Se    # test statistic
teststat <- t
teststat
## [1] -0.7277362
criticalval <- qt(p = 0.05,df = n-1)
criticalval
## [1] -1.649484
shadet(df = n-1,
       pcts = c(0.05, 0.95),
       justbelow = TRUE,   # one-sided test: shade only the lower 5% tail
       col = 'lightblue')
# Mark the observed test statistic with a vertical line
abline(v = teststat, col = 'gray')
pval <- pnorm(t)   # left-tail p-value (normal approximation)
pval
## [1] 0.2333875
alpha
## [1] 0.05
myp(p = pval, alpha = alpha)
## [1] "FAIL 2 REJECT"
Since the p-value is greater than alpha, we fail to reject the null hypothesis: there is not enough evidence to support the newspaper's claim that only a minority of American adults do not go to college due to financial reasons.
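As a cross-check, the built-in prop.test() can run the same one-sided test; the success count of 159 below is an assumption (0.48 x 331, rounded), since only the rounded sample proportion is reported:
# Cross-check with the built-in one-sample proportion test
prop.test(x = 159, n = 331, p = 0.5,
          alternative = "less", correct = FALSE)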
SE = sqrt((0.48*(1-0.48))/331)
SE
## [1] 0.02746049
With a sample proportion of 48% and a standard error of about 2.7%, the 95% confidence interval is roughly 0.48 +/- 1.96 x 0.027 = (0.43, 0.53). The hypothesized value of 0.50 falls inside this interval, which is consistent with failing to reject the null hypothesis.
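A minimal sketch of that interval, using the normal critical value qnorm(0.975) and the quantities computed above:
# Approximate 95% confidence interval for the sample proportion
phat + c(-1, 1) * qnorm(0.975) * SE   # roughly (0.43, 0.53); contains 0.50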
Hypotheses: Ho: the difference of the two means (distracted and undistracted groups) is zero. Ha: the difference of the two means is not zero.
Level of significance: alpha = 0.05
Distracted = Group 1, Undistracted = Group 2
# Ho: Mu1 - Mu2 = 0
# Ha: Mu1 - Mu2 != 0
mu1 <- 4.9   # sample mean, Group 1 (distracted)
mu2 <- 6.1   # sample mean, Group 2 (undistracted)
alpha = 0.05
# dist = t
n1 = 22
n2 = 22
df1 = n1 - 1
df2 = n2 - 1
sd1 = 1.8
sd2 = 1.8
var1 = sd1^2
var2 = sd2^2
pointestdiff = (mu1 - mu2)
denSe = sqrt(var1/n1 + var2/n2)
t = (pointestdiff - 0) / denSe
teststat <- t
teststat
## [1] -2.211083
numdf = (var1/n1 + var2/n2)^2
dendf = (var1/n1)^2 / df1 + (var2/n2)^2 / df2
df = numdf / dendf
criticalvalsatter <- qt(p = (1-alpha/2), df = numdf / dendf)
criticalvalsatter
## [1] 2.018082
criticalvalcons <- qt(p = (1-alpha/2), df = min(df1, df2))
criticalvalcons
## [1] 2.079614
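Because the two groups have equal sizes and equal standard deviations here, the Welch degrees of freedom reduce to 2(n - 1), which is why the Satterthwaite critical value above is evaluated at 42 df:
numdf / dendf   # ~42: with equal n and equal variances the Welch df equal 2 * (n - 1)
df1 + df2       # 42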
shadet(df = df,
       pcts = c(0.025, 0.975),   # two-sided test at alpha = 0.05
       col = 'lightblue')
# Mark the observed test statistic with a vertical line
abline(v = teststat, col = 'gray')
# Two-sided p-value: twice the tail area beyond |t|
pval = 2 * pt(q = -abs(t), df = numdf / dendf)
pval
## [1] 0.032528
myp(p = pval, alpha = alpha)
## [1] "REJECT Ho"
# Conservative p-value using df = min(n1 - 1, n2 - 1)
pvalconservative = 2 * pt(q = -abs(t), df = min(df1, df2))
pvalconservative
## [1] 0.038257
myp(p = pvalconservative, alpha = alpha)
## [1] "REJECT Ho"
CI <- c(pointestdiff - (criticalvalcons * denSe),
pointestdiff + (criticalvalcons * denSe))
CI
## [1] -2.3286489 -0.0713511
Since the p-value is below alpha and the 95% confidence interval for the difference excludes zero, we reject the null hypothesis: the difference of the two means (distracted and undistracted groups) is significantly different from zero.
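As a cross-check of the hand calculation, the same Welch test can be run directly from the summary statistics. This assumes the BSDA package and its tsum.test() function are available (they are not used elsewhere in this document):
# Welch two-sample t test from summary statistics (requires the BSDA package)
library(BSDA)
tsum.test(mean.x = mu1, s.x = sd1, n.x = n1,
          mean.y = mu2, s.y = sd2, n.y = n2,
          var.equal = FALSE)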
obs <- 25                                       # sample size
samplemean <- (65 + 77) / 2                     # midpoint of the interval
marginerror <- (77 - 65) / 2                    # half the interval width
tdf <- round(qt(c(0.05, 0.95), df = 24)[2], 3)  # t critical value with 24 df
se <- round((77 - samplemean) / tdf, 3)         # standard error = margin of error / t
sd <- se * sqrt(25)                             # sample standard deviation
samplemean
## [1] 71
marginerror
## [1] 6
sd
## [1] 17.535
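Rebuilding the interval from the recovered values is a quick sanity check; note that because the code above uses qt(0.95, df = 24), it treats (65, 77) as a 90% interval:
# Sanity check: mean +/- t * SE should reproduce the original interval
samplemean + c(-1, 1) * tdf * se   # approximately (65, 77)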
mym <- matrix(c(4,30,24,45), nrow=2)
colnames(mym) <- c("control", "treatment")
rownames(mym) <- c("alive","dead")
mym
## control treatment
## alive 4 24
## dead 30 45
Hypotheses: Ho: the heart transplant did not affect the survival rate of patients. Ha: the heart transplant did affect the survival rate of patients.
The normal approximation works well only when each group has enough observed successes and failures. Here the counts are small — the control group has only 4 survivors — so the success-failure condition is not met. A normal-based confidence interval for the difference in survival rates would therefore not be valid and could give a false sense of security when drawing conclusions from these data.
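One concrete way to see the problem is to look at the observed counts and, if a test is needed, to use an exact method that does not rely on the normal approximation. This is only an illustrative sketch using the table built above:
# The control group has only 4 survivors out of 34: too few "successes"
colSums(mym)                    # patients per group
mym["alive", ] / colSums(mym)   # observed survival proportions
fisher.test(mym)                # exact test; avoids the normal approximation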
titanic <- read.csv("/Users/bellajean/Downloads/train.csv")
head(titanic)
## PassengerId Survived Pclass
## 1 1 0 3
## 2 2 1 1
## 3 3 1 3
## 4 4 1 1
## 5 5 0 3
## 6 6 0 3
## Name Sex Age SibSp Parch
## 1 Braund, Mr. Owen Harris male 22 1 0
## 2 Cumings, Mrs. John Bradley (Florence Briggs Thayer) female 38 1 0
## 3 Heikkinen, Miss. Laina female 26 0 0
## 4 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35 1 0
## 5 Allen, Mr. William Henry male 35 0 0
## 6 Moran, Mr. James male NA 0 0
## Ticket Fare Cabin Embarked
## 1 A/5 21171 7.2500 S
## 2 PC 17599 71.2833 C85 C
## 3 STON/O2. 3101282 7.9250 S
## 4 113803 53.1000 C123 S
## 5 373450 8.0500 S
## 6 330877 8.4583 Q
tail(titanic)
## PassengerId Survived Pclass Name Sex
## 886 886 0 3 Rice, Mrs. William (Margaret Norton) female
## 887 887 0 2 Montvila, Rev. Juozas male
## 888 888 1 1 Graham, Miss. Margaret Edith female
## 889 889 0 3 Johnston, Miss. Catherine Helen "Carrie" female
## 890 890 1 1 Behr, Mr. Karl Howell male
## 891 891 0 3 Dooley, Mr. Patrick male
## Age SibSp Parch Ticket Fare Cabin Embarked
## 886 39 0 5 382652 29.125 Q
## 887 27 0 0 211536 13.000 S
## 888 19 0 0 112053 30.000 B42 S
## 889 NA 1 2 W./C. 6607 23.450 S
## 890 26 0 0 111369 30.000 C148 C
## 891 32 0 0 370376 7.750 Q
cor(titanic$Age, titanic$Survived)
## [1] NA
cor(titanic$Pclass, titanic$Survived)
## [1] -0.338481
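The NA above is caused by missing values in Age; restricting cor() to complete pairs returns a numeric estimate:
# Age contains missing values, so the default correlation is NA
cor(titanic$Age, titanic$Survived, use = "complete.obs")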
library("psych")
describe(titanic)
## vars n mean sd median trimmed mad min max range
## PassengerId 1 891 446.00 257.35 446.00 446.00 330.62 1.00 891.00 890.00
## Survived 2 891 0.38 0.49 0.00 0.35 0.00 0.00 1.00 1.00
## Pclass 3 891 2.31 0.84 3.00 2.39 0.00 1.00 3.00 2.00
## Name* 4 891 446.00 257.35 446.00 446.00 330.62 1.00 891.00 890.00
## Sex* 5 891 1.65 0.48 2.00 1.68 0.00 1.00 2.00 1.00
## Age 6 714 29.70 14.53 28.00 29.27 13.34 0.42 80.00 79.58
## SibSp 7 891 0.52 1.10 0.00 0.27 0.00 0.00 8.00 8.00
## Parch 8 891 0.38 0.81 0.00 0.18 0.00 0.00 6.00 6.00
## Ticket* 9 891 339.52 200.83 338.00 339.65 268.35 1.00 681.00 680.00
## Fare 10 891 32.20 49.69 14.45 21.38 10.24 0.00 512.33 512.33
## Cabin* 11 891 18.63 38.14 1.00 8.29 0.00 1.00 148.00 147.00
## Embarked* 12 891 3.53 0.80 4.00 3.66 0.00 1.00 4.00 3.00
## skew kurtosis se
## PassengerId 0.00 -1.20 8.62
## Survived 0.48 -1.77 0.02
## Pclass -0.63 -1.28 0.03
## Name* 0.00 -1.20 8.62
## Sex* -0.62 -1.62 0.02
## Age 0.39 0.16 0.54
## SibSp 3.68 17.73 0.04
## Parch 2.74 9.69 0.03
## Ticket* 0.00 -1.28 6.73
## Fare 4.77 33.12 1.66
## Cabin* 2.09 3.07 1.28
## Embarked* -1.27 -0.16 0.03
set.seed(100)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
smalltitanic <- sample_n(titanic, 500)
relation <- lm(smalltitanic$Survived~smalltitanic$Age,
data=smalltitanic)
print(summary(relation))
##
## Call:
## lm(formula = smalltitanic$Survived ~ smalltitanic$Age, data = smalltitanic)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.4837 -0.4074 -0.3439 0.5894 0.7132
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.486850 0.054650 8.908 <2e-16 ***
## smalltitanic$Age -0.003176 0.001651 -1.924 0.0551 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4873 on 403 degrees of freedom
## (95 observations deleted due to missingness)
## Multiple R-squared: 0.009101, Adjusted R-squared: 0.006642
## F-statistic: 3.701 on 1 and 403 DF, p-value: 0.05507
plot(smalltitanic$Age,
     smalltitanic$Survived,
     col = "blue",
     main = "Relationship of Survival and Age",
     xlab = "Age",
     ylab = "Survived Y/N")
# Add the fitted regression line to the scatterplot
abline(relation)
prediction <- predict(relation, newdata=smalltitanic)   # NA where Age is missing
expected <- ifelse(prediction>0.5, yes=1, no=0)
observed <- smalltitanic$Survived
outcometable <- table(observed,expected)
outcometable
## expected
## observed 0
## 0 246
## 1 159
# Proportion of cases where the predicted class matches the observed class
correctlypredicted <- mean(expected == observed, na.rm = TRUE)
correctlypredicted
## [1] 0.6074074
Linearity: the model is linear in its parameters, although modeling a binary outcome with a straight line strains this assumption.
Random: the data are treated as a random sample from the population (and the 500 rows used here were randomly sampled from the full dataset).
Non-collinearity: with a single predictor there is no collinearity to worry about; age and survival are not perfectly correlated.
Exogeneity: the error term is assumed to be uncorrelated with the predictor (age).
Homoscedasticity: the error variance is assumed to be constant across values of age, which is questionable for a 0/1 outcome.
This analysis could be improved with a more thorough data clean-up, for example handling the missing Age values before modeling.
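Beyond cleaning, a model designed for a 0/1 outcome would also address the linearity and homoscedasticity concerns noted above. A minimal logistic-regression sketch on the same sample (not part of the original analysis):
# Logistic regression of survival on age; rows with missing Age are dropped
logit <- glm(Survived ~ Age, data = smalltitanic, family = binomial)
summary(logit)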