More Basic Stat Functions

Content

 MidIQR
 Robust Standard Deviation
 Median Absolute Deviation
 Standard Error
 Bootstrap of Standard Error
 Bootstrap of MAD with confidence intervals and standard error table
 Bootstrap Error of a percentile
 t-test
 median
 Calculate a Percentile
 Change numeric variables to factor variables
 step AIC using MASS
 split data into test and train
 alternate method to split data into test and train

set.seed(123);
data1=c(rep(NA,10),rcauchy(20),rnorm(970))

Middle Interquartile Range

# --------------- MidIQR ----------------

MidIQR <- function(dat){
  dat1 <- dat[!is.na(dat)] # remove empty values, place values in dat1
  qtls <- quantile(dat1,prob=c(0.25,0.75)) # list of .25 and .75 quantile values for dat1
  val <- mean(qtls) # mean of the two quantile values
  val # print said mean
}
MidIQR(data1)

## [1] 0.01790665

Robust Standard Deviation

# --------------- Robust Standard Deviation --------------

rsd <- function(dat){
  dat1 <- dat[!is.na(dat)]
  qtls <- quantile(dat1,prob=c(0.25,0.75))
  val <- (qtls[2]-qtls[1])/(qnorm(.75)-qnorm(.25)) # calculation of rsd
  names(val) <- NULL # would otherwise display "75%" above the data of interest
  val
}
rsd(data1)

## [1] 0.9593433

Median Absolute Deviation

# --------------- Median Absolute Deviation -----------

MAD <- function(data, na.rm=TRUE){
  if (na.rm) data=na.omit(data) #if na.rm=TRUE eliminate missing values
  median(abs(data-median(data)))*1.4826
}
MAD(data1)

## [1] 0.9630704

Standard Error

# --------------- Standard Error ------------------

library(MASS)
data("Boston")

se.medv <- sd(Boston$medv)/sqrt(nrow(Boston))

Standard Error by bootstrap

# --------------- Standard Error by bootstrap ----------------
data<-c(1,4,4,5,3,2,6,7,9,10,4,2) # example vector of data
index<-sample(1:length(data), rep=TRUE) #bootstrap sample
z=data[index] #bt sampling of data
mean(z) #mean estimation of bt sample

## [1] 3.5

btest=NULL;B=1000;set.seed(123) # B is number of bootstrap iterations
for (i in 1:B) {
  index<-sample(1:length(data), rep=TRUE)
  z=data[index]
  btest[i]=mean(z)    
}
se=sd(btest)

Bootstrap of MAD with confidence intervals and standard error table

# --------------- Bootstrap of MAD with confidence intervals and standard error table --------------------

data<-c(1,4,4,5,3,2,6,7,9,10,4,2) # example vector of data
thetahat=mad(data) # value of statistical test results on unsampled data (mad estimation), used for comparison later

btest = NULL # empty vector of statistical test results
B=20 # number of times will run bootstrap test
set.seed(12) # set seed for "random selection"

for(i in 1:B){
  index <- sample(1:length(data), replace = TRUE) # create a bootsrap index vector, replace okay
  z = data[index] # vector of values of bootstrap
  btest[i] = mad(z) # add result of statistical text to index i of btest
}

se = sd(btest) # standard error (standard deviation of bootstrap test results)
ci = c(thetahat - 1.96*se, thetahat + 1.96*se) # confidence interval

ci # print confidence interval

## [1] 1.171716 4.758684

list(StdError = se, ConfInterval = ci) # prints table with labels StdError, ConfInterval1, ConfInterval2

## $StdError
## [1] 0.915043
## 
## $ConfInterval
## [1] 1.171716 4.758684

Bootstrap Error of a percentile

# --------------- Bootstrap Error of a percentile -----------------

index<-sample(1:length(data), rep=TRUE) #bootstrap sample
z=data[index] #bt sampling of data
quantile(z, .1) #10th quantile estimation of bt sample

## 10% 
##   2

##  10% 
## 11.8
btest=NULL;B=1000;set.seed(123)
for (i in 1:B) {
  index<-sample(1:length(data), rep=TRUE)
  z=data[index]
  btest[i]=quantile(z, .1)     
}
se=sd(btest)  #SE of quantile
se # 0.5017864

## [1] 0.7789935

## [1] 0.5017864

t test

# --------------- t test  --------------------

t.test(Boston$medv)

## 
##  One Sample t-test
## 
## data:  Boston$medv
## t = 55.111, df = 505, p-value < 2.2e-16
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
##  21.72953 23.33608
## sample estimates:
## mean of x 
##  22.53281

## 
##  One Sample t-test
## 
## data:  Boston$medv
## t = 55.111, df = 505, p-value < 2.2e-16
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
##  21.72953 23.33608
## sample estimates:
## mean of x 
##  22.53281

Median

# --------------- Median -----------------

median.medv=median(data)

Calculate a Percentile

# --------------- Calculate a Percentile ---------------

quant.medv=quantile(data, .1) #print 10th percentile

Change numeric variables to factor variables

# --------------- Change numeric variables to factor variables -------------------

library(ElemStatLearn)
data = SAheart

saHeart <- SAheart # Make new data so as to not corrupt original data
saHeart$chd <- factor(SAheart$chd, labels = c("No", "Yes")) # 0 = "No", 1 = "Yes" because goes in order least to greatest
dim(saHeart) # 462   10

## [1] 462  10

summary(saHeart)

##       sbp           tobacco             ldl           adiposity    
##  Min.   :101.0   Min.   : 0.0000   Min.   : 0.980   Min.   : 6.74  
##  1st Qu.:124.0   1st Qu.: 0.0525   1st Qu.: 3.283   1st Qu.:19.77  
##  Median :134.0   Median : 2.0000   Median : 4.340   Median :26.11  
##  Mean   :138.3   Mean   : 3.6356   Mean   : 4.740   Mean   :25.41  
##  3rd Qu.:148.0   3rd Qu.: 5.5000   3rd Qu.: 5.790   3rd Qu.:31.23  
##  Max.   :218.0   Max.   :31.2000   Max.   :15.330   Max.   :42.49  
##     famhist        typea         obesity         alcohol      
##  Absent :270   Min.   :13.0   Min.   :14.70   Min.   :  0.00  
##  Present:192   1st Qu.:47.0   1st Qu.:22.98   1st Qu.:  0.51  
##                Median :53.0   Median :25.80   Median :  7.51  
##                Mean   :53.1   Mean   :26.04   Mean   : 17.04  
##                3rd Qu.:60.0   3rd Qu.:28.50   3rd Qu.: 23.89  
##                Max.   :78.0   Max.   :46.58   Max.   :147.19  
##       age         chd     
##  Min.   :15.00   No :302  
##  1st Qu.:31.00   Yes:160  
##  Median :45.00            
##  Mean   :42.82            
##  3rd Qu.:55.00            
##  Max.   :64.00

step AIC using MASS

# --------------- step AIC using MASS -----------------
data(package="ElemStatLearn")
?SAheart

## starting httpd help server ...

##  done

# The data SAheart is South African Hearth Disease Data.
# The variable chd is response variable with 1: coronary heart disease and 0: no. 

library(MASS) # has the stepAIC function
data=saHeart
saSample = sample(1:462, .7*462)
chdLM <- glm(saHeart$chd ~ ., family = binomial, data = saHeart) # set general linear model with all variables
stepChdLM <- stepAIC(chdLM, trace = T) # Looks for lowest AIC

## Start:  AIC=492.14
## saHeart$chd ~ sbp + tobacco + ldl + adiposity + famhist + typea + 
##     obesity + alcohol + age
## 
##             Df Deviance    AIC
## - alcohol    1   472.14 490.14
## - adiposity  1   472.55 490.55
## - sbp        1   473.44 491.44
## <none>           472.14 492.14
## - obesity    1   474.23 492.23
## - ldl        1   481.07 499.07
## - tobacco    1   481.67 499.67
## - typea      1   483.05 501.05
## - age        1   486.53 504.53
## - famhist    1   488.89 506.89
## 
## Step:  AIC=490.14
## saHeart$chd ~ sbp + tobacco + ldl + adiposity + famhist + typea + 
##     obesity + age
## 
##             Df Deviance    AIC
## - adiposity  1   472.55 488.55
## - sbp        1   473.47 489.47
## <none>           472.14 490.14
## - obesity    1   474.24 490.24
## - ldl        1   481.15 497.15
## - tobacco    1   482.06 498.06
## - typea      1   483.06 499.06
## - age        1   486.64 502.64
## - famhist    1   488.99 504.99
## 
## Step:  AIC=488.55
## saHeart$chd ~ sbp + tobacco + ldl + famhist + typea + obesity + 
##     age
## 
##           Df Deviance    AIC
## - sbp      1   473.98 487.98
## <none>         472.55 488.55
## - obesity  1   474.65 488.65
## - tobacco  1   482.54 496.54
## - ldl      1   482.95 496.95
## - typea    1   483.19 497.19
## - famhist  1   489.38 503.38
## - age      1   495.48 509.48
## 
## Step:  AIC=487.98
## saHeart$chd ~ tobacco + ldl + famhist + typea + obesity + age
## 
##           Df Deviance    AIC
## - obesity  1   475.69 487.69
## <none>         473.98 487.98
## - tobacco  1   484.18 496.18
## - typea    1   484.30 496.30
## - ldl      1   484.53 496.53
## - famhist  1   490.58 502.58
## - age      1   502.11 514.11
## 
## Step:  AIC=487.69
## saHeart$chd ~ tobacco + ldl + famhist + typea + age
## 
##           Df Deviance    AIC
## <none>         475.69 487.69
## - ldl      1   484.71 494.71
## - typea    1   485.44 495.44
## - tobacco  1   486.03 496.03
## - famhist  1   492.09 502.09
## - age      1   502.38 512.38

Split data into test and train

# --------------- split data into test and train -----------------
library(ISLR)
data("College")
n=nrow(College)
trainIndex = sample(1:n, size = round(0.7*n), replace = FALSE)
train = College[trainIndex,]
test = College[-trainIndex,]

head(train)

##                           Private Apps Accept Enroll Top10perc Top25perc
## Virginia State University      No 2996   2440    704         2        30
## Eureka College                Yes  560    454    113        36        56
## Presbyterian College          Yes 1082    832    302        34        63
## Hillsdale College             Yes  920    745    347        35        66
## Hanover College               Yes 1006    837    317        33        65
## Simpson College               Yes 1016    872    300        27        57
##                           F.Undergrad P.Undergrad Outstate Room.Board
## Virginia State University        3006         338     5587       4845
## Eureka College                    484          16    10955       3450
## Presbyterian College             1133          30    11859       3635
## Hillsdale College                1133          42    11090       4700
## Hanover College                  1024          15     8200       3485
## Simpson College                  1116         602    11250       4980
##                           Books Personal PhD Terminal S.F.Ratio
## Virginia State University   500      600  61       63      16.0
## Eureka College              330      670  62       87      10.6
## Presbyterian College        554     1429  80       85      13.4
## Hillsdale College           400      750  80       80      12.0
## Hanover College             500     1200  84       84      10.6
## Simpson College             550     1400  66       73      15.8
##                           perc.alumni Expend Grad.Rate
## Virginia State University          11   5733        31
## Eureka College                     31   9552        53
## Presbyterian College               42   8354        85
## Hillsdale College                  31  12639        79
## Hanover College                    26   9248        64
## Simpson College                    36   7411        70

head(test)

##                           Private Apps Accept Enroll Top10perc Top25perc
## Agnes Scott College           Yes  417    349    137        60        89
## Alaska Pacific University     Yes  193    146     55        16        44
## Albertson College             Yes  587    479    158        38        62
## Albright College              Yes 1038    839    227        30        63
## Alfred University             Yes 1732   1425    472        37        75
## Allegheny College             Yes 2652   1900    484        44        77
##                           F.Undergrad P.Undergrad Outstate Room.Board
## Agnes Scott College               510          63    12960       5450
## Alaska Pacific University         249         869     7560       4120
## Albertson College                 678          41    13500       3335
## Albright College                  973         306    15595       4400
## Alfred University                1830         110    16548       5406
## Allegheny College                1707          44    17080       4440
##                           Books Personal PhD Terminal S.F.Ratio
## Agnes Scott College         450      875  92       97       7.7
## Alaska Pacific University   800     1500  76       72      11.9
## Albertson College           500      675  67       73       9.4
## Albright College            300      500  79       84      11.3
## Alfred University           500      600  82       88      11.3
## Allegheny College           400      600  73       91       9.9
##                           perc.alumni Expend Grad.Rate
## Agnes Scott College                37  19016        59
## Alaska Pacific University           2  10922        15
## Albertson College                  11   9727        55
## Albright College                   23  11644        80
## Alfred University                  31  10932        73
## Allegheny College                  41  11711        76

Another method to split data into test and train

# --------------- alternate method to split data into test and train --------------- 

library(caTools) # for splitting the dataset
library(ElemStatLearn);data(spam) # spam data
# create test and train datasets
split = sample.split(spam$spam, SplitRatio = 2/3)
training_set = subset(spam, split == TRUE) # 3068  58
test_set = subset(spam, split == FALSE) # 1533  58

# ---------------

FIN