Content
MidIQR
Robust Standard Deviation
Median Absolute Deviation
Standard Error
Bootstrap of Standard Error
Bootstrap of MAD with confidence intervals and standard error table
Bootstrap Error of a percentile
t-test
median
Calculate a Percentile
Change numeric variables to factor variables
step AIC using MASS
split data into test and train
alternate method to split data into test and train
set.seed(123);
data1=c(rep(NA,10),rcauchy(20),rnorm(970))
Middle Interquartile Range
# --------------- MidIQR ----------------
MidIQR <- function(dat){
dat1 <- dat[!is.na(dat)] # remove empty values, place values in dat1
qtls <- quantile(dat1,prob=c(0.25,0.75)) # list of .25 and .75 quantile values for dat1
val <- mean(qtls) # mean of the two quantile values
val # print said mean
}
MidIQR(data1)
## [1] 0.01790665
Robust Standard Deviation
# --------------- Robust Standard Deviation --------------
rsd <- function(dat){
dat1 <- dat[!is.na(dat)]
qtls <- quantile(dat1,prob=c(0.25,0.75))
val <- (qtls[2]-qtls[1])/(qnorm(.75)-qnorm(.25)) # calculation of rsd
names(val) <- NULL # would otherwise display "75%" above the data of interest
val
}
rsd(data1)
## [1] 0.9593433
Standard Error
# --------------- Standard Error ------------------
library(MASS)
data("Boston")
se.medv <- sd(Boston$medv)/sqrt(nrow(Boston))
Standard Error by bootstrap
# --------------- Standard Error by bootstrap ----------------
data<-c(1,4,4,5,3,2,6,7,9,10,4,2) # example vector of data
index<-sample(1:length(data), rep=TRUE) #bootstrap sample
z=data[index] #bt sampling of data
mean(z) #mean estimation of bt sample
## [1] 3.5
btest=NULL;B=1000;set.seed(123) # B is number of bootstrap iterations
for (i in 1:B) {
index<-sample(1:length(data), rep=TRUE)
z=data[index]
btest[i]=mean(z)
}
se=sd(btest)
Bootstrap of MAD with confidence intervals and standard error table
# --------------- Bootstrap of MAD with confidence intervals and standard error table --------------------
data<-c(1,4,4,5,3,2,6,7,9,10,4,2) # example vector of data
thetahat=mad(data) # value of statistical test results on unsampled data (mad estimation), used for comparison later
btest = NULL # empty vector of statistical test results
B=20 # number of times will run bootstrap test
set.seed(12) # set seed for "random selection"
for(i in 1:B){
index <- sample(1:length(data), replace = TRUE) # create a bootsrap index vector, replace okay
z = data[index] # vector of values of bootstrap
btest[i] = mad(z) # add result of statistical text to index i of btest
}
se = sd(btest) # standard error (standard deviation of bootstrap test results)
ci = c(thetahat - 1.96*se, thetahat + 1.96*se) # confidence interval
ci # print confidence interval
## [1] 1.171716 4.758684
list(StdError = se, ConfInterval = ci) # prints table with labels StdError, ConfInterval1, ConfInterval2
## $StdError
## [1] 0.915043
##
## $ConfInterval
## [1] 1.171716 4.758684
Bootstrap Error of a percentile
# --------------- Bootstrap Error of a percentile -----------------
index<-sample(1:length(data), rep=TRUE) #bootstrap sample
z=data[index] #bt sampling of data
quantile(z, .1) #10th quantile estimation of bt sample
## 10%
## 2
## 10%
## 11.8
btest=NULL;B=1000;set.seed(123)
for (i in 1:B) {
index<-sample(1:length(data), rep=TRUE)
z=data[index]
btest[i]=quantile(z, .1)
}
se=sd(btest) #SE of quantile
se # 0.5017864
## [1] 0.7789935
## [1] 0.5017864
t test
# --------------- t test --------------------
t.test(Boston$medv)
##
## One Sample t-test
##
## data: Boston$medv
## t = 55.111, df = 505, p-value < 2.2e-16
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
## 21.72953 23.33608
## sample estimates:
## mean of x
## 22.53281
##
## One Sample t-test
##
## data: Boston$medv
## t = 55.111, df = 505, p-value < 2.2e-16
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
## 21.72953 23.33608
## sample estimates:
## mean of x
## 22.53281
Calculate a Percentile
# --------------- Calculate a Percentile ---------------
quant.medv=quantile(data, .1) #print 10th percentile
Change numeric variables to factor variables
# --------------- Change numeric variables to factor variables -------------------
library(ElemStatLearn)
data = SAheart
saHeart <- SAheart # Make new data so as to not corrupt original data
saHeart$chd <- factor(SAheart$chd, labels = c("No", "Yes")) # 0 = "No", 1 = "Yes" because goes in order least to greatest
dim(saHeart) # 462 10
## [1] 462 10
summary(saHeart)
## sbp tobacco ldl adiposity
## Min. :101.0 Min. : 0.0000 Min. : 0.980 Min. : 6.74
## 1st Qu.:124.0 1st Qu.: 0.0525 1st Qu.: 3.283 1st Qu.:19.77
## Median :134.0 Median : 2.0000 Median : 4.340 Median :26.11
## Mean :138.3 Mean : 3.6356 Mean : 4.740 Mean :25.41
## 3rd Qu.:148.0 3rd Qu.: 5.5000 3rd Qu.: 5.790 3rd Qu.:31.23
## Max. :218.0 Max. :31.2000 Max. :15.330 Max. :42.49
## famhist typea obesity alcohol
## Absent :270 Min. :13.0 Min. :14.70 Min. : 0.00
## Present:192 1st Qu.:47.0 1st Qu.:22.98 1st Qu.: 0.51
## Median :53.0 Median :25.80 Median : 7.51
## Mean :53.1 Mean :26.04 Mean : 17.04
## 3rd Qu.:60.0 3rd Qu.:28.50 3rd Qu.: 23.89
## Max. :78.0 Max. :46.58 Max. :147.19
## age chd
## Min. :15.00 No :302
## 1st Qu.:31.00 Yes:160
## Median :45.00
## Mean :42.82
## 3rd Qu.:55.00
## Max. :64.00
step AIC using MASS
# --------------- step AIC using MASS -----------------
data(package="ElemStatLearn")
?SAheart
## starting httpd help server ...
## done
# The data SAheart is South African Hearth Disease Data.
# The variable chd is response variable with 1: coronary heart disease and 0: no.
library(MASS) # has the stepAIC function
data=saHeart
saSample = sample(1:462, .7*462)
chdLM <- glm(saHeart$chd ~ ., family = binomial, data = saHeart) # set general linear model with all variables
stepChdLM <- stepAIC(chdLM, trace = T) # Looks for lowest AIC
## Start: AIC=492.14
## saHeart$chd ~ sbp + tobacco + ldl + adiposity + famhist + typea +
## obesity + alcohol + age
##
## Df Deviance AIC
## - alcohol 1 472.14 490.14
## - adiposity 1 472.55 490.55
## - sbp 1 473.44 491.44
## <none> 472.14 492.14
## - obesity 1 474.23 492.23
## - ldl 1 481.07 499.07
## - tobacco 1 481.67 499.67
## - typea 1 483.05 501.05
## - age 1 486.53 504.53
## - famhist 1 488.89 506.89
##
## Step: AIC=490.14
## saHeart$chd ~ sbp + tobacco + ldl + adiposity + famhist + typea +
## obesity + age
##
## Df Deviance AIC
## - adiposity 1 472.55 488.55
## - sbp 1 473.47 489.47
## <none> 472.14 490.14
## - obesity 1 474.24 490.24
## - ldl 1 481.15 497.15
## - tobacco 1 482.06 498.06
## - typea 1 483.06 499.06
## - age 1 486.64 502.64
## - famhist 1 488.99 504.99
##
## Step: AIC=488.55
## saHeart$chd ~ sbp + tobacco + ldl + famhist + typea + obesity +
## age
##
## Df Deviance AIC
## - sbp 1 473.98 487.98
## <none> 472.55 488.55
## - obesity 1 474.65 488.65
## - tobacco 1 482.54 496.54
## - ldl 1 482.95 496.95
## - typea 1 483.19 497.19
## - famhist 1 489.38 503.38
## - age 1 495.48 509.48
##
## Step: AIC=487.98
## saHeart$chd ~ tobacco + ldl + famhist + typea + obesity + age
##
## Df Deviance AIC
## - obesity 1 475.69 487.69
## <none> 473.98 487.98
## - tobacco 1 484.18 496.18
## - typea 1 484.30 496.30
## - ldl 1 484.53 496.53
## - famhist 1 490.58 502.58
## - age 1 502.11 514.11
##
## Step: AIC=487.69
## saHeart$chd ~ tobacco + ldl + famhist + typea + age
##
## Df Deviance AIC
## <none> 475.69 487.69
## - ldl 1 484.71 494.71
## - typea 1 485.44 495.44
## - tobacco 1 486.03 496.03
## - famhist 1 492.09 502.09
## - age 1 502.38 512.38
Split data into test and train
# --------------- split data into test and train -----------------
library(ISLR)
data("College")
n=nrow(College)
trainIndex = sample(1:n, size = round(0.7*n), replace = FALSE)
train = College[trainIndex,]
test = College[-trainIndex,]
head(train)
## Private Apps Accept Enroll Top10perc Top25perc
## Virginia State University No 2996 2440 704 2 30
## Eureka College Yes 560 454 113 36 56
## Presbyterian College Yes 1082 832 302 34 63
## Hillsdale College Yes 920 745 347 35 66
## Hanover College Yes 1006 837 317 33 65
## Simpson College Yes 1016 872 300 27 57
## F.Undergrad P.Undergrad Outstate Room.Board
## Virginia State University 3006 338 5587 4845
## Eureka College 484 16 10955 3450
## Presbyterian College 1133 30 11859 3635
## Hillsdale College 1133 42 11090 4700
## Hanover College 1024 15 8200 3485
## Simpson College 1116 602 11250 4980
## Books Personal PhD Terminal S.F.Ratio
## Virginia State University 500 600 61 63 16.0
## Eureka College 330 670 62 87 10.6
## Presbyterian College 554 1429 80 85 13.4
## Hillsdale College 400 750 80 80 12.0
## Hanover College 500 1200 84 84 10.6
## Simpson College 550 1400 66 73 15.8
## perc.alumni Expend Grad.Rate
## Virginia State University 11 5733 31
## Eureka College 31 9552 53
## Presbyterian College 42 8354 85
## Hillsdale College 31 12639 79
## Hanover College 26 9248 64
## Simpson College 36 7411 70
head(test)
## Private Apps Accept Enroll Top10perc Top25perc
## Agnes Scott College Yes 417 349 137 60 89
## Alaska Pacific University Yes 193 146 55 16 44
## Albertson College Yes 587 479 158 38 62
## Albright College Yes 1038 839 227 30 63
## Alfred University Yes 1732 1425 472 37 75
## Allegheny College Yes 2652 1900 484 44 77
## F.Undergrad P.Undergrad Outstate Room.Board
## Agnes Scott College 510 63 12960 5450
## Alaska Pacific University 249 869 7560 4120
## Albertson College 678 41 13500 3335
## Albright College 973 306 15595 4400
## Alfred University 1830 110 16548 5406
## Allegheny College 1707 44 17080 4440
## Books Personal PhD Terminal S.F.Ratio
## Agnes Scott College 450 875 92 97 7.7
## Alaska Pacific University 800 1500 76 72 11.9
## Albertson College 500 675 67 73 9.4
## Albright College 300 500 79 84 11.3
## Alfred University 500 600 82 88 11.3
## Allegheny College 400 600 73 91 9.9
## perc.alumni Expend Grad.Rate
## Agnes Scott College 37 19016 59
## Alaska Pacific University 2 10922 15
## Albertson College 11 9727 55
## Albright College 23 11644 80
## Alfred University 31 10932 73
## Allegheny College 41 11711 76
Another method to split data into test and train
# --------------- alternate method to split data into test and train ---------------
library(caTools) # for splitting the dataset
library(ElemStatLearn);data(spam) # spam data
# create test and train datasets
split = sample.split(spam$spam, SplitRatio = 2/3)
training_set = subset(spam, split == TRUE) # 3068 58
test_set = subset(spam, split == FALSE) # 1533 58
# ---------------
FIN