Content

 MidIQR
 Robust Standard Deviation
 Median Absolute Deviation
 Standard Error
 Bootstrap of Standard Error
 Bootstrap of MAD with confidence intervals and standard error table
 Bootstrap Error of a percentile
 t-test
 median
 Calculate a Percentile
 Change numeric variables to factor variables
 step AIC using MASS
 split data into test and train
 alternate method to split data into test and train
set.seed(123);
data1=c(rep(NA,10),rcauchy(20),rnorm(970))

Middle Interquartile Range

# --------------- MidIQR ----------------

MidIQR <- function(dat){
  dat1 <- dat[!is.na(dat)] # remove empty values, place values in dat1
  qtls <- quantile(dat1,prob=c(0.25,0.75)) # list of .25 and .75 quantile values for dat1
  val <- mean(qtls) # mean of the two quantile values
  val # print said mean
}
MidIQR(data1)
## [1] 0.01790665

Robust Standard Deviation

# --------------- Robust Standard Deviation --------------

rsd <- function(dat){
  dat1 <- dat[!is.na(dat)]
  qtls <- quantile(dat1,prob=c(0.25,0.75))
  val <- (qtls[2]-qtls[1])/(qnorm(.75)-qnorm(.25)) # calculation of rsd
  names(val) <- NULL # would otherwise display "75%" above the data of interest
  val
}
rsd(data1)
## [1] 0.9593433

Median Absolute Deviation

# --------------- Median Absolute Deviation -----------

MAD <- function(data, na.rm=TRUE){
  if (na.rm) data=na.omit(data) #if na.rm=TRUE eliminate missing values
  median(abs(data-median(data)))*1.4826
}
MAD(data1)
## [1] 0.9630704

Standard Error

# --------------- Standard Error ------------------

library(MASS)
data("Boston")

se.medv <- sd(Boston$medv)/sqrt(nrow(Boston))

Standard Error by bootstrap

# --------------- Standard Error by bootstrap ----------------
data<-c(1,4,4,5,3,2,6,7,9,10,4,2) # example vector of data
index<-sample(1:length(data), rep=TRUE) #bootstrap sample
z=data[index] #bt sampling of data
mean(z) #mean estimation of bt sample 
## [1] 3.5
btest=NULL;B=1000;set.seed(123) # B is number of bootstrap iterations
for (i in 1:B) {
  index<-sample(1:length(data), rep=TRUE)
  z=data[index]
  btest[i]=mean(z)    
}
se=sd(btest)  

Bootstrap of MAD with confidence intervals and standard error table

# --------------- Bootstrap of MAD with confidence intervals and standard error table --------------------

data<-c(1,4,4,5,3,2,6,7,9,10,4,2) # example vector of data
thetahat=mad(data) # value of statistical test results on unsampled data (mad estimation), used for comparison later

btest = NULL # empty vector of statistical test results
B=20 # number of times will run bootstrap test
set.seed(12) # set seed for "random selection"

for(i in 1:B){
  index <- sample(1:length(data), replace = TRUE) # create a bootsrap index vector, replace okay
  z = data[index] # vector of values of bootstrap
  btest[i] = mad(z) # add result of statistical text to index i of btest
}

se = sd(btest) # standard error (standard deviation of bootstrap test results)
ci = c(thetahat - 1.96*se, thetahat + 1.96*se) # confidence interval

ci # print confidence interval
## [1] 1.171716 4.758684
list(StdError = se, ConfInterval = ci) # prints table with labels StdError, ConfInterval1, ConfInterval2
## $StdError
## [1] 0.915043
## 
## $ConfInterval
## [1] 1.171716 4.758684

Bootstrap Error of a percentile

# --------------- Bootstrap Error of a percentile -----------------

index<-sample(1:length(data), rep=TRUE) #bootstrap sample
z=data[index] #bt sampling of data
quantile(z, .1) #10th quantile estimation of bt sample  
## 10% 
##   2
##  10% 
## 11.8
btest=NULL;B=1000;set.seed(123)
for (i in 1:B) {
  index<-sample(1:length(data), rep=TRUE)
  z=data[index]
  btest[i]=quantile(z, .1)     
}
se=sd(btest)  #SE of quantile
se # 0.5017864
## [1] 0.7789935
## [1] 0.5017864

t test

# --------------- t test  --------------------

t.test(Boston$medv)
## 
##  One Sample t-test
## 
## data:  Boston$medv
## t = 55.111, df = 505, p-value < 2.2e-16
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
##  21.72953 23.33608
## sample estimates:
## mean of x 
##  22.53281
## 
##  One Sample t-test
## 
## data:  Boston$medv
## t = 55.111, df = 505, p-value < 2.2e-16
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
##  21.72953 23.33608
## sample estimates:
## mean of x 
##  22.53281

Median

# --------------- Median -----------------

median.medv=median(data)

Calculate a Percentile

# --------------- Calculate a Percentile ---------------

quant.medv=quantile(data, .1) #print 10th percentile

Change numeric variables to factor variables

# --------------- Change numeric variables to factor variables -------------------

library(ElemStatLearn)
data = SAheart

saHeart <- SAheart # Make new data so as to not corrupt original data
saHeart$chd <- factor(SAheart$chd, labels = c("No", "Yes")) # 0 = "No", 1 = "Yes" because goes in order least to greatest
dim(saHeart) # 462   10
## [1] 462  10
summary(saHeart)
##       sbp           tobacco             ldl           adiposity    
##  Min.   :101.0   Min.   : 0.0000   Min.   : 0.980   Min.   : 6.74  
##  1st Qu.:124.0   1st Qu.: 0.0525   1st Qu.: 3.283   1st Qu.:19.77  
##  Median :134.0   Median : 2.0000   Median : 4.340   Median :26.11  
##  Mean   :138.3   Mean   : 3.6356   Mean   : 4.740   Mean   :25.41  
##  3rd Qu.:148.0   3rd Qu.: 5.5000   3rd Qu.: 5.790   3rd Qu.:31.23  
##  Max.   :218.0   Max.   :31.2000   Max.   :15.330   Max.   :42.49  
##     famhist        typea         obesity         alcohol      
##  Absent :270   Min.   :13.0   Min.   :14.70   Min.   :  0.00  
##  Present:192   1st Qu.:47.0   1st Qu.:22.98   1st Qu.:  0.51  
##                Median :53.0   Median :25.80   Median :  7.51  
##                Mean   :53.1   Mean   :26.04   Mean   : 17.04  
##                3rd Qu.:60.0   3rd Qu.:28.50   3rd Qu.: 23.89  
##                Max.   :78.0   Max.   :46.58   Max.   :147.19  
##       age         chd     
##  Min.   :15.00   No :302  
##  1st Qu.:31.00   Yes:160  
##  Median :45.00            
##  Mean   :42.82            
##  3rd Qu.:55.00            
##  Max.   :64.00

step AIC using MASS

# --------------- step AIC using MASS -----------------
data(package="ElemStatLearn")
?SAheart
## starting httpd help server ...
##  done
# The data SAheart is South African Hearth Disease Data.
# The variable chd is response variable with 1: coronary heart disease and 0: no. 

library(MASS) # has the stepAIC function
data=saHeart
saSample = sample(1:462, .7*462)
chdLM <- glm(saHeart$chd ~ ., family = binomial, data = saHeart) # set general linear model with all variables
stepChdLM <- stepAIC(chdLM, trace = T) # Looks for lowest AIC
## Start:  AIC=492.14
## saHeart$chd ~ sbp + tobacco + ldl + adiposity + famhist + typea + 
##     obesity + alcohol + age
## 
##             Df Deviance    AIC
## - alcohol    1   472.14 490.14
## - adiposity  1   472.55 490.55
## - sbp        1   473.44 491.44
## <none>           472.14 492.14
## - obesity    1   474.23 492.23
## - ldl        1   481.07 499.07
## - tobacco    1   481.67 499.67
## - typea      1   483.05 501.05
## - age        1   486.53 504.53
## - famhist    1   488.89 506.89
## 
## Step:  AIC=490.14
## saHeart$chd ~ sbp + tobacco + ldl + adiposity + famhist + typea + 
##     obesity + age
## 
##             Df Deviance    AIC
## - adiposity  1   472.55 488.55
## - sbp        1   473.47 489.47
## <none>           472.14 490.14
## - obesity    1   474.24 490.24
## - ldl        1   481.15 497.15
## - tobacco    1   482.06 498.06
## - typea      1   483.06 499.06
## - age        1   486.64 502.64
## - famhist    1   488.99 504.99
## 
## Step:  AIC=488.55
## saHeart$chd ~ sbp + tobacco + ldl + famhist + typea + obesity + 
##     age
## 
##           Df Deviance    AIC
## - sbp      1   473.98 487.98
## <none>         472.55 488.55
## - obesity  1   474.65 488.65
## - tobacco  1   482.54 496.54
## - ldl      1   482.95 496.95
## - typea    1   483.19 497.19
## - famhist  1   489.38 503.38
## - age      1   495.48 509.48
## 
## Step:  AIC=487.98
## saHeart$chd ~ tobacco + ldl + famhist + typea + obesity + age
## 
##           Df Deviance    AIC
## - obesity  1   475.69 487.69
## <none>         473.98 487.98
## - tobacco  1   484.18 496.18
## - typea    1   484.30 496.30
## - ldl      1   484.53 496.53
## - famhist  1   490.58 502.58
## - age      1   502.11 514.11
## 
## Step:  AIC=487.69
## saHeart$chd ~ tobacco + ldl + famhist + typea + age
## 
##           Df Deviance    AIC
## <none>         475.69 487.69
## - ldl      1   484.71 494.71
## - typea    1   485.44 495.44
## - tobacco  1   486.03 496.03
## - famhist  1   492.09 502.09
## - age      1   502.38 512.38

Split data into test and train

# --------------- split data into test and train -----------------
library(ISLR)
data("College")
n=nrow(College)
trainIndex = sample(1:n, size = round(0.7*n), replace = FALSE)
train = College[trainIndex,]
test = College[-trainIndex,]

head(train)
##                           Private Apps Accept Enroll Top10perc Top25perc
## Virginia State University      No 2996   2440    704         2        30
## Eureka College                Yes  560    454    113        36        56
## Presbyterian College          Yes 1082    832    302        34        63
## Hillsdale College             Yes  920    745    347        35        66
## Hanover College               Yes 1006    837    317        33        65
## Simpson College               Yes 1016    872    300        27        57
##                           F.Undergrad P.Undergrad Outstate Room.Board
## Virginia State University        3006         338     5587       4845
## Eureka College                    484          16    10955       3450
## Presbyterian College             1133          30    11859       3635
## Hillsdale College                1133          42    11090       4700
## Hanover College                  1024          15     8200       3485
## Simpson College                  1116         602    11250       4980
##                           Books Personal PhD Terminal S.F.Ratio
## Virginia State University   500      600  61       63      16.0
## Eureka College              330      670  62       87      10.6
## Presbyterian College        554     1429  80       85      13.4
## Hillsdale College           400      750  80       80      12.0
## Hanover College             500     1200  84       84      10.6
## Simpson College             550     1400  66       73      15.8
##                           perc.alumni Expend Grad.Rate
## Virginia State University          11   5733        31
## Eureka College                     31   9552        53
## Presbyterian College               42   8354        85
## Hillsdale College                  31  12639        79
## Hanover College                    26   9248        64
## Simpson College                    36   7411        70
head(test)
##                           Private Apps Accept Enroll Top10perc Top25perc
## Agnes Scott College           Yes  417    349    137        60        89
## Alaska Pacific University     Yes  193    146     55        16        44
## Albertson College             Yes  587    479    158        38        62
## Albright College              Yes 1038    839    227        30        63
## Alfred University             Yes 1732   1425    472        37        75
## Allegheny College             Yes 2652   1900    484        44        77
##                           F.Undergrad P.Undergrad Outstate Room.Board
## Agnes Scott College               510          63    12960       5450
## Alaska Pacific University         249         869     7560       4120
## Albertson College                 678          41    13500       3335
## Albright College                  973         306    15595       4400
## Alfred University                1830         110    16548       5406
## Allegheny College                1707          44    17080       4440
##                           Books Personal PhD Terminal S.F.Ratio
## Agnes Scott College         450      875  92       97       7.7
## Alaska Pacific University   800     1500  76       72      11.9
## Albertson College           500      675  67       73       9.4
## Albright College            300      500  79       84      11.3
## Alfred University           500      600  82       88      11.3
## Allegheny College           400      600  73       91       9.9
##                           perc.alumni Expend Grad.Rate
## Agnes Scott College                37  19016        59
## Alaska Pacific University           2  10922        15
## Albertson College                  11   9727        55
## Albright College                   23  11644        80
## Alfred University                  31  10932        73
## Allegheny College                  41  11711        76

Another method to split data into test and train

# --------------- alternate method to split data into test and train --------------- 

library(caTools) # for splitting the dataset
library(ElemStatLearn);data(spam) # spam data
# create test and train datasets
split = sample.split(spam$spam, SplitRatio = 2/3)
training_set = subset(spam, split == TRUE) # 3068  58
test_set = subset(spam, split == FALSE) # 1533  58

# --------------- 

FIN