Read in SAS dataset using sas7bdat package

library(sas7bdat)

  fhs <- read.sas7bdat("C:/Users/miche/Downloads/fhs.sas7bdat")
    dim(fhs) #11627    13

## [1] 11627    13

    str(fhs)

## 'data.frame':    11627 obs. of  13 variables:
##  $ SEX     : num  2 2 2 2 2 2 2 2 2 2 ...
##  $ RANDID  : num  1080920 8303090 5807368 610021 5080716 ...
##  $ TOTCHOL : num  253 261 NaN 208 246 298 299 264 169 239 ...
##  $ AGE     : num  64 67 75 66 78 62 68 59 48 66 ...
##  $ SYSBP   : num  295 282 267 265 254 248 246 244 243 243 ...
##  $ DIABP   : num  135 112 105 150 92 ...
##  $ DIABETES: num  0 0 0 1 0 0 1 0 0 0 ...
##  $ BPMEDS  : num  0 1 0 0 9 1 1 1 0 0 ...
##  $ PERIOD  : num  1 2 3 2 3 1 3 1 1 2 ...
##  $ CIGPDAY : num  0 0 20 0 0 0 9 0 0 0 ...
##  $ HEARTRTE: num  92 90 90 80 60 96 112 76 85 90 ...
##  $ HDLC    : num  NaN NaN NaN NaN 39 NaN 24 NaN NaN NaN ...
##  $ LDLC    : num  NaN NaN NaN NaN 119 NaN 275 NaN NaN NaN ...
##  - attr(*, "pkg.version")= chr "0.5"
##  - attr(*, "column.info")=List of 13
##   ..$ :List of 10
##   .. ..$ name  : chr "SEX"
##   .. ..$ offset: int 96
##   .. ..$ length: int 5
##   .. ..$ type  : chr "numeric"
##   .. ..$ fhdr  : int 0
##   .. ..$ foff  : int 0
##   .. ..$ flen  : int 0
##   .. ..$ lhdr  : int 0
##   .. ..$ loff  : int 0
##   .. ..$ llen  : int 0
##   ..$ :List of 11
##   .. ..$ name  : chr "RANDID"
##   .. ..$ offset: int 0
##   .. ..$ length: int 8
##   .. ..$ type  : chr "numeric"
##   .. ..$ fhdr  : int 0
##   .. ..$ foff  : int 0
##   .. ..$ flen  : int 0
##   .. ..$ label : chr "Random ID"
##   .. ..$ lhdr  : int 0
##   .. ..$ loff  : int 48
##   .. ..$ llen  : int 9
##   ..$ :List of 10
##   .. ..$ name  : chr "TOTCHOL"
##   .. ..$ offset: int 8
##   .. ..$ length: int 8
##   .. ..$ type  : chr "numeric"
##   .. ..$ fhdr  : int 0
##   .. ..$ foff  : int 0
##   .. ..$ flen  : int 0
##   .. ..$ lhdr  : int 0
##   .. ..$ loff  : int 0
##   .. ..$ llen  : int 0
##   ..$ :List of 10
##   .. ..$ name  : chr "AGE"
##   .. ..$ offset: int 16
##   .. ..$ length: int 8
##   .. ..$ type  : chr "numeric"
##   .. ..$ fhdr  : int 0
##   .. ..$ foff  : int 0
##   .. ..$ flen  : int 0
##   .. ..$ lhdr  : int 0
##   .. ..$ loff  : int 0
##   .. ..$ llen  : int 0
##   ..$ :List of 10
##   .. ..$ name  : chr "SYSBP"
##   .. ..$ offset: int 24
##   .. ..$ length: int 8
##   .. ..$ type  : chr "numeric"
##   .. ..$ fhdr  : int 0
##   .. ..$ foff  : int 0
##   .. ..$ flen  : int 0
##   .. ..$ lhdr  : int 0
##   .. ..$ loff  : int 0
##   .. ..$ llen  : int 0
##   ..$ :List of 10
##   .. ..$ name  : chr "DIABP"
##   .. ..$ offset: int 32
##   .. ..$ length: int 8
##   .. ..$ type  : chr "numeric"
##   .. ..$ fhdr  : int 0
##   .. ..$ foff  : int 0
##   .. ..$ flen  : int 0
##   .. ..$ lhdr  : int 0
##   .. ..$ loff  : int 0
##   .. ..$ llen  : int 0
##   ..$ :List of 10
##   .. ..$ name  : chr "DIABETES"
##   .. ..$ offset: int 40
##   .. ..$ length: int 8
##   .. ..$ type  : chr "numeric"
##   .. ..$ fhdr  : int 0
##   .. ..$ foff  : int 0
##   .. ..$ flen  : int 0
##   .. ..$ lhdr  : int 0
##   .. ..$ loff  : int 0
##   .. ..$ llen  : int 0
##   ..$ :List of 10
##   .. ..$ name  : chr "BPMEDS"
##   .. ..$ offset: int 48
##   .. ..$ length: int 8
##   .. ..$ type  : chr "numeric"
##   .. ..$ fhdr  : int 0
##   .. ..$ foff  : int 0
##   .. ..$ flen  : int 0
##   .. ..$ lhdr  : int 0
##   .. ..$ loff  : int 0
##   .. ..$ llen  : int 0
##   ..$ :List of 11
##   .. ..$ name  : chr "PERIOD"
##   .. ..$ offset: int 56
##   .. ..$ length: int 8
##   .. ..$ type  : chr "numeric"
##   .. ..$ fhdr  : int 0
##   .. ..$ foff  : int 0
##   .. ..$ flen  : int 0
##   .. ..$ label : chr "Examination cycle"
##   .. ..$ lhdr  : int 0
##   .. ..$ loff  : int 112
##   .. ..$ llen  : int 17
##   ..$ :List of 11
##   .. ..$ name  : chr "CIGPDAY"
##   .. ..$ offset: int 64
##   .. ..$ length: int 8
##   .. ..$ type  : chr "numeric"
##   .. ..$ fhdr  : int 0
##   .. ..$ foff  : int 0
##   .. ..$ flen  : int 0
##   .. ..$ label : chr "Cigarettes per day"
##   .. ..$ lhdr  : int 0
##   .. ..$ loff  : int 140
##   .. ..$ llen  : int 18
##   ..$ :List of 11
##   .. ..$ name  : chr "HEARTRTE"
##   .. ..$ offset: int 72
##   .. ..$ length: int 8
##   .. ..$ type  : chr "numeric"
##   .. ..$ fhdr  : int 0
##   .. ..$ foff  : int 0
##   .. ..$ flen  : int 0
##   .. ..$ label : chr "Ventricular Rate (beats/min)"
##   .. ..$ lhdr  : int 0
##   .. ..$ loff  : int 168
##   .. ..$ llen  : int 28
##   ..$ :List of 11
##   .. ..$ name  : chr "HDLC"
##   .. ..$ offset: int 80
##   .. ..$ length: int 8
##   .. ..$ type  : chr "numeric"
##   .. ..$ fhdr  : int 0
##   .. ..$ foff  : int 0
##   .. ..$ flen  : int 0
##   .. ..$ label : chr "HDL Cholesterol mg/dL"
##   .. ..$ lhdr  : int 0
##   .. ..$ loff  : int 200
##   .. ..$ llen  : int 21
##   ..$ :List of 11
##   .. ..$ name  : chr "LDLC"
##   .. ..$ offset: int 88
##   .. ..$ length: int 8
##   .. ..$ type  : chr "numeric"
##   .. ..$ fhdr  : int 0
##   .. ..$ foff  : int 0
##   .. ..$ flen  : int 0
##   .. ..$ label : chr "LDL Cholesterol mg/dL"
##   .. ..$ lhdr  : int 0
##   .. ..$ loff  : int 228
##   .. ..$ llen  : int 21
##  - attr(*, "date.created")= POSIXct, format: "2018-09-11 15:46:28"
##  - attr(*, "date.modified")= POSIXct, format: "2018-09-11 15:46:28"
##  - attr(*, "SAS.release")= chr "9.0401M5"
##  - attr(*, "SAS.host")= chr "Linux"
##  - attr(*, "OS.version")= chr "3.10.0-693.21.1."
##  - attr(*, "OS.maker")= chr ""
##  - attr(*, "OS.name")= chr "x86_64"
##  - attr(*, "endian")= chr "little"
##  - attr(*, "winunix")= chr "windows"

    names(fhs)

##  [1] "SEX"      "RANDID"   "TOTCHOL"  "AGE"      "SYSBP"    "DIABP"   
##  [7] "DIABETES" "BPMEDS"   "PERIOD"   "CIGPDAY"  "HEARTRTE" "HDLC"    
## [13] "LDLC"

subset/filter/(or use dyplyr )

#Note [] extracts a list, [[]] extracts 'element(s)' within the list. 
  Male<-fhs[which(fhs$SEX==1),]
# or 
  Male<-subset(fhs,fhs$SEX==1)

Graphs

#Histogram
hist(Male$SYSBP)

#Scatterplot
plot(Male$SYSBP~Male$AGE)

#Boxplot
boxplot(fhs$SYSBP~fhs$SEX)

#Plot by group using ifelse function
plotcolor=ifelse(fhs$SEX==1,"black","red")
plot(fhs$SYSBP~fhs$AGE,col=plotcolor,pch=20,cex=1)

############################################## # Statistical tests & diagnostic plots # ##############################################

#Regression
model1<-lm(fhs$SYSBP~fhs$AGE+fhs$SEX)
  summary(model1) #sex - cont.

## 
## Call:
## lm(formula = fhs$SYSBP ~ fhs$AGE + fhs$SEX)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -56.565 -14.189  -2.467  11.419 149.420 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 82.95926    1.27463  65.085  < 2e-16 ***
## fhs$AGE      0.92487    0.02036  45.428  < 2e-16 ***
## fhs$SEX      1.71463    0.39308   4.362  1.3e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 20.99 on 11624 degrees of freedom
## Multiple R-squared:  0.1527, Adjusted R-squared:  0.1526 
## F-statistic:  1048 on 2 and 11624 DF,  p-value: < 2.2e-16

  #example of extracting a t value: age
    tvalue <- summary(model1)$coef[2,3:4] 
      print(tvalue)

##  t value Pr(>|t|) 
##  45.4283   0.0000

#Regression with Interaction
model2<-lm(fhs$SYSBP~fhs$AGE*fhs$SEX)
  summary(model2)

## 
## Call:
## lm(formula = fhs$SYSBP ~ fhs$AGE * fhs$SEX)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -55.281 -14.195  -2.602  11.233 147.160 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     133.531558   3.725307  35.844   <2e-16 ***
## fhs$AGE          -0.000739   0.067242  -0.011    0.991    
## fhs$SEX         -30.494454   2.265754 -13.459   <2e-16 ***
## fhs$AGE:fhs$SEX   0.588629   0.040790  14.431   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 20.8 on 11623 degrees of freedom
## Multiple R-squared:  0.1676, Adjusted R-squared:  0.1674 
## F-statistic: 780.2 on 3 and 11623 DF,  p-value: < 2.2e-16

  summary.aov(model2)

##                    Df  Sum Sq Mean Sq F value   Pr(>F)    
## fhs$AGE             1  914470  914470 2113.11  < 2e-16 ***
## fhs$SEX             1    8381    8381   19.37 1.09e-05 ***
## fhs$AGE:fhs$SEX     1   90118   90118  208.24  < 2e-16 ***
## Residuals       11623 5029961     433                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

# note that it treats SEX as a continuous variable even though we want it to be a categorical variable
fhs$SEX<-as.factor(fhs$SEX)

model2<-lm(fhs$SYSBP~fhs$AGE*fhs$SEX)
  summary(model2)

## 
## Call:
## lm(formula = fhs$SYSBP ~ fhs$AGE * fhs$SEX)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -55.281 -14.195  -2.602  11.233 147.160 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      103.03710    1.70727   60.35   <2e-16 ***
## fhs$AGE            0.58789    0.03086   19.05   <2e-16 ***
## fhs$SEX2         -30.49445    2.26575  -13.46   <2e-16 ***
## fhs$AGE:fhs$SEX2   0.58863    0.04079   14.43   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 20.8 on 11623 degrees of freedom
## Multiple R-squared:  0.1676, Adjusted R-squared:  0.1674 
## F-statistic: 780.2 on 3 and 11623 DF,  p-value: < 2.2e-16

  summary.aov(model2)

##                    Df  Sum Sq Mean Sq F value   Pr(>F)    
## fhs$AGE             1  914470  914470 2113.11  < 2e-16 ***
## fhs$SEX             1    8381    8381   19.37 1.09e-05 ***
## fhs$AGE:fhs$SEX     1   90118   90118  208.24  < 2e-16 ***
## Residuals       11623 5029961     433                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

#Diagnostc plots
plot(model2)

  #select 
  plot(model2,which=1)

  plot(model2,which=2)

  plot(model2,which=3)

  plot(model2,which=4)

############################################## # Vectors and Matrices # ##############################################

# Vectors
# Array
x<-c(1,2,3,4,5,6)
# A vector is a series of values, all of the same type
  is.vector(x)

## [1] TRUE

## [1] 1 2 3 4 5 6

  x[3]

## [1] 3

length(x)

## [1] 6

#It is sometimes better to define the dimensions of the vector before you use it
  x2<-vector(length=5)
  x2 #logical

## [1] FALSE FALSE FALSE FALSE FALSE

  x2[3]=1 #binary code
  x2

## [1] 0 0 1 0 0

  x3=x2[-3]
  x3

## [1] 0 0 0 0

#Matrix
  x4<-matrix(1:6,ncol=3)
  x4

##      [,1] [,2] [,3]
## [1,]    1    3    5
## [2,]    2    4    6

#Transpose
  t(x4)

##      [,1] [,2]
## [1,]    1    2
## [2,]    3    4
## [3,]    5    6

#Inversion (square matrix, 2x2)
x3<-x4[,-3] #take out the third column
  x3

##      [,1] [,2]
## [1,]    1    3
## [2,]    2    4

x7<- solve(x3) #inverse 
print(x3 %*% x7) #identity

##      [,1] [,2]
## [1,]    1    0
## [2,]    0    1

Simulation

#Simulate Limear regression
one <-rep(1,100) #1 for 100 times
  head(one)

## [1] 1 1 1 1 1 1

pred1<-rnorm(100)
  head(pred1)

## [1] -0.08963242  0.57614622  0.62662294 -0.09309603  0.63811626 -1.47789589

X <- cbind(one,pred1)
  head(X, n=3)

##      one       pred1
## [1,]   1 -0.08963242
## [2,]   1  0.57614622
## [3,]   1  0.62662294

#rnorm(n, mean = 0, sd = 1) 
y=3*one+2*pred1+(rnorm(100)*sqrt(.25)+1) #with mean of 1
y=3*one+2*pred1+(rnorm(100)*sqrt(.25))#standard mean 0 with standard deviation of 0.25 
  head(y)

## [1] 2.8129684 4.2289147 4.4362017 3.3476747 3.7685168 0.1805651

coef(lm(y~pred1))

## (Intercept)       pred1 
##    2.902379    1.973771

solve(t(X)%*%X)%*%t(X)%*%y

##           [,1]
## one   2.902379
## pred1 1.973771

#Extract data
#write.csv(cbind(y,X),"C:/Users/miche/Downloads/fromR.csv")

Stacking and Merging

df1<-data.frame(x=rnorm(2),y=rexp(2))
  df1

##            x        y
## 1 -0.1820836 3.104689
## 2  2.1446123 1.935409

df2<-data.frame(x=rpois(2,2),y=runif(2))
  df2

##   x         y
## 1 2 0.7177619
## 2 2 0.3966144

df3<-rbind(df1,df2)
  df3

##            x         y
## 1 -0.1820836 3.1046892
## 2  2.1446123 1.9354095
## 3  2.0000000 0.7177619
## 4  2.0000000 0.3966144

df4<-data.frame(id=(1:4),x=rgeom(4,.5))
  df4

##   id x
## 1  1 1
## 2  2 4
## 3  3 0
## 4  4 0

df5<-data.frame(id=(1:4),y=rbinom(4,3,.5))
  df5

##   id y
## 1  1 1
## 2  2 1
## 3  3 3
## 4  4 0

df6=merge(df4,df5,by="id")
  df6

##   id x y
## 1  1 1 1
## 2  2 4 1
## 3  3 0 3
## 4  4 0 0

df7<-data.frame(patid=1:5,z=rt(5,10)) #5 obs with mean of 10 from t distribution
  df7

##   patid           z
## 1     1  0.23141253
## 2     2 -0.47581243
## 3     3  0.78186018
## 4     4 -0.62837861
## 5     5 -0.04752034

df8<-merge(x=df6,y=df7,by.x="id",by.y="patid")
  df8

##   id x y          z
## 1  1 1 1  0.2314125
## 2  2 4 1 -0.4758124
## 3  3 0 3  0.7818602
## 4  4 0 0 -0.6283786

df9<-merge(x=df6,y=df7,all=TRUE,by.x="id",by.y="patid")
  df9

##   id  x  y           z
## 1  1  1  1  0.23141253
## 2  2  4  1 -0.47581243
## 3  3  0  3  0.78186018
## 4  4  0  0 -0.62837861
## 5  5 NA NA -0.04752034

  #rbind is set in sas
  #merge is merge in sas
  #no need to sort in r

Conditional & Loops

#If then statements

if(mean(fhs$SYSBP,na.rm=TRUE)>140) 
  print("Hypertensive sample")  else print("non-hypertensive sample")

## [1] "non-hypertensive sample"

HT<-ifelse(fhs$SYSBP>140 | fhs$DIABP>80, 1, 0)

# Do loops
len=10
fibvals=numeric(len)
str(len)

##  num 10

str(fibvals)

##  num [1:10] 0 0 0 0 0 0 0 0 0 0

fibvals[1]=1
fibvals[2]=1
fibvals

##  [1] 1 1 0 0 0 0 0 0 0 0

for(i in 3:len)
  {
  fibvals[i]=fibvals[i-1]+fibvals[i-1]
}

fibvals #sum of the previous 2

##  [1]   1   1   2   4   8  16  32  64 128 256

#while
mysum=0
counter=0

while(mysum<10)
  {
  mysum=mysum+rexp(1)
  counter=counter+1
}
mysum

## [1] 10.18687

counter

## [1] 10

#optimization
mysum =0
counter =0
while(counter<10)
  {
  mysum=mysum+rexp(1)
  counter=counter+1
}
mysum

## [1] 10.80217

counter

## [1] 10

# and is &
# or is |
# not is !

Reshape (or reshape package)

#Long to wide
fhs.sort <- fhs[order(fhs$RANDID),]
fhs_wide <- reshape(fhs.sort, 
             timevar = "PERIOD",
             idvar = c("RANDID", "SEX"),
             direction = "wide")
#Wide to long
fhs_long <- reshape(fhs_wide, 
             varying = c("SYSBP.1", "SYSBP.2", "SYSBP.3"), 
             v.names = "SYSBP",
             timevar = "TIME", 
             times = c("1", "2", "3"), 
             direction = "long")
fhs_long<-fhs_long[order(fhs_long$RANDID),]

User defined functions

#USER-DEFINED FUNCTIONS 
  #myfunction <- function(arg1, arg2, ... ){
#  statements
#  return(object)
#}

# function example - get measures of central tendency
# and spread for a numeric vector x. The user has a
# choice of measures and whether the results are printed.

mysummary <- function(x,npar=TRUE,print=TRUE) {
  if (!npar) {
    center <- mean(x); spread <- sd(x) 
  } else {
    center <- median(x); spread <- mad(x) 
  }
  if (print & !npar) {
    cat("Mean=", center, "\n", "SD=", spread, "\n")
  } else if (print & npar) {
    cat("Median=", center, "\n", "MAD=", spread, "\n")
  } #\n = next line
  result <- list(center=center,spread=spread)
  return(result)
}

# invoking the function 
set.seed(1234)
  x <- rpois(500, 4) 
  y <- mysummary(x)

## Median= 4 
##  MAD= 1.4826

Median= 4
MAD= 1.4826 
   y$center # is the median (4)

## [1] 4

   y$spread # is the median absolute deviation (1.4826)

## [1] 1.4826

y <- mysummary(x, npar=FALSE, print=FALSE)
  # no output 
  y$center #is the mean (4.052)

## [1] 4.052

   y$spread #is the standard deviation (2.01927)

## [1] 2.01927

#R-functions
solveols<-function(y,x){
  bh<-solve(t(x)%*%x)%*%(t(x)%*%y)
  return(bh)
}




#solveols(y,x)
#R-functions-add residual variance
solveols<-function(y,x){
  bh<-solve(t(x)%*%x)%*%(t(x)%*%y)
  colnames(bh)="Estimates"
  sigmasq<-sum((y-x%*%bh)^2)/(length(y)-2)
  varbh=sigmasq*solve(t(x)%*%x)
  return(list(bh,s=sqrt(sigmasq),varcovbetahat=varbh))
}
#solveols(y,X)

#solveols(y,X)$varcovbetahat

Intermediate R - Review

Michelle C. Hsu

September 18, 2018