library(sas7bdat)
fhs <- read.sas7bdat("C:/Users/miche/Downloads/fhs.sas7bdat")
dim(fhs) #11627 13
## [1] 11627 13
str(fhs)
## 'data.frame': 11627 obs. of 13 variables:
## $ SEX : num 2 2 2 2 2 2 2 2 2 2 ...
## $ RANDID : num 1080920 8303090 5807368 610021 5080716 ...
## $ TOTCHOL : num 253 261 NaN 208 246 298 299 264 169 239 ...
## $ AGE : num 64 67 75 66 78 62 68 59 48 66 ...
## $ SYSBP : num 295 282 267 265 254 248 246 244 243 243 ...
## $ DIABP : num 135 112 105 150 92 ...
## $ DIABETES: num 0 0 0 1 0 0 1 0 0 0 ...
## $ BPMEDS : num 0 1 0 0 9 1 1 1 0 0 ...
## $ PERIOD : num 1 2 3 2 3 1 3 1 1 2 ...
## $ CIGPDAY : num 0 0 20 0 0 0 9 0 0 0 ...
## $ HEARTRTE: num 92 90 90 80 60 96 112 76 85 90 ...
## $ HDLC : num NaN NaN NaN NaN 39 NaN 24 NaN NaN NaN ...
## $ LDLC : num NaN NaN NaN NaN 119 NaN 275 NaN NaN NaN ...
## - attr(*, "pkg.version")= chr "0.5"
## - attr(*, "column.info")=List of 13
## ..$ :List of 10
## .. ..$ name : chr "SEX"
## .. ..$ offset: int 96
## .. ..$ length: int 5
## .. ..$ type : chr "numeric"
## .. ..$ fhdr : int 0
## .. ..$ foff : int 0
## .. ..$ flen : int 0
## .. ..$ lhdr : int 0
## .. ..$ loff : int 0
## .. ..$ llen : int 0
## ..$ :List of 11
## .. ..$ name : chr "RANDID"
## .. ..$ offset: int 0
## .. ..$ length: int 8
## .. ..$ type : chr "numeric"
## .. ..$ fhdr : int 0
## .. ..$ foff : int 0
## .. ..$ flen : int 0
## .. ..$ label : chr "Random ID"
## .. ..$ lhdr : int 0
## .. ..$ loff : int 48
## .. ..$ llen : int 9
## ..$ :List of 10
## .. ..$ name : chr "TOTCHOL"
## .. ..$ offset: int 8
## .. ..$ length: int 8
## .. ..$ type : chr "numeric"
## .. ..$ fhdr : int 0
## .. ..$ foff : int 0
## .. ..$ flen : int 0
## .. ..$ lhdr : int 0
## .. ..$ loff : int 0
## .. ..$ llen : int 0
## ..$ :List of 10
## .. ..$ name : chr "AGE"
## .. ..$ offset: int 16
## .. ..$ length: int 8
## .. ..$ type : chr "numeric"
## .. ..$ fhdr : int 0
## .. ..$ foff : int 0
## .. ..$ flen : int 0
## .. ..$ lhdr : int 0
## .. ..$ loff : int 0
## .. ..$ llen : int 0
## ..$ :List of 10
## .. ..$ name : chr "SYSBP"
## .. ..$ offset: int 24
## .. ..$ length: int 8
## .. ..$ type : chr "numeric"
## .. ..$ fhdr : int 0
## .. ..$ foff : int 0
## .. ..$ flen : int 0
## .. ..$ lhdr : int 0
## .. ..$ loff : int 0
## .. ..$ llen : int 0
## ..$ :List of 10
## .. ..$ name : chr "DIABP"
## .. ..$ offset: int 32
## .. ..$ length: int 8
## .. ..$ type : chr "numeric"
## .. ..$ fhdr : int 0
## .. ..$ foff : int 0
## .. ..$ flen : int 0
## .. ..$ lhdr : int 0
## .. ..$ loff : int 0
## .. ..$ llen : int 0
## ..$ :List of 10
## .. ..$ name : chr "DIABETES"
## .. ..$ offset: int 40
## .. ..$ length: int 8
## .. ..$ type : chr "numeric"
## .. ..$ fhdr : int 0
## .. ..$ foff : int 0
## .. ..$ flen : int 0
## .. ..$ lhdr : int 0
## .. ..$ loff : int 0
## .. ..$ llen : int 0
## ..$ :List of 10
## .. ..$ name : chr "BPMEDS"
## .. ..$ offset: int 48
## .. ..$ length: int 8
## .. ..$ type : chr "numeric"
## .. ..$ fhdr : int 0
## .. ..$ foff : int 0
## .. ..$ flen : int 0
## .. ..$ lhdr : int 0
## .. ..$ loff : int 0
## .. ..$ llen : int 0
## ..$ :List of 11
## .. ..$ name : chr "PERIOD"
## .. ..$ offset: int 56
## .. ..$ length: int 8
## .. ..$ type : chr "numeric"
## .. ..$ fhdr : int 0
## .. ..$ foff : int 0
## .. ..$ flen : int 0
## .. ..$ label : chr "Examination cycle"
## .. ..$ lhdr : int 0
## .. ..$ loff : int 112
## .. ..$ llen : int 17
## ..$ :List of 11
## .. ..$ name : chr "CIGPDAY"
## .. ..$ offset: int 64
## .. ..$ length: int 8
## .. ..$ type : chr "numeric"
## .. ..$ fhdr : int 0
## .. ..$ foff : int 0
## .. ..$ flen : int 0
## .. ..$ label : chr "Cigarettes per day"
## .. ..$ lhdr : int 0
## .. ..$ loff : int 140
## .. ..$ llen : int 18
## ..$ :List of 11
## .. ..$ name : chr "HEARTRTE"
## .. ..$ offset: int 72
## .. ..$ length: int 8
## .. ..$ type : chr "numeric"
## .. ..$ fhdr : int 0
## .. ..$ foff : int 0
## .. ..$ flen : int 0
## .. ..$ label : chr "Ventricular Rate (beats/min)"
## .. ..$ lhdr : int 0
## .. ..$ loff : int 168
## .. ..$ llen : int 28
## ..$ :List of 11
## .. ..$ name : chr "HDLC"
## .. ..$ offset: int 80
## .. ..$ length: int 8
## .. ..$ type : chr "numeric"
## .. ..$ fhdr : int 0
## .. ..$ foff : int 0
## .. ..$ flen : int 0
## .. ..$ label : chr "HDL Cholesterol mg/dL"
## .. ..$ lhdr : int 0
## .. ..$ loff : int 200
## .. ..$ llen : int 21
## ..$ :List of 11
## .. ..$ name : chr "LDLC"
## .. ..$ offset: int 88
## .. ..$ length: int 8
## .. ..$ type : chr "numeric"
## .. ..$ fhdr : int 0
## .. ..$ foff : int 0
## .. ..$ flen : int 0
## .. ..$ label : chr "LDL Cholesterol mg/dL"
## .. ..$ lhdr : int 0
## .. ..$ loff : int 228
## .. ..$ llen : int 21
## - attr(*, "date.created")= POSIXct, format: "2018-09-11 15:46:28"
## - attr(*, "date.modified")= POSIXct, format: "2018-09-11 15:46:28"
## - attr(*, "SAS.release")= chr "9.0401M5"
## - attr(*, "SAS.host")= chr "Linux"
## - attr(*, "OS.version")= chr "3.10.0-693.21.1."
## - attr(*, "OS.maker")= chr ""
## - attr(*, "OS.name")= chr "x86_64"
## - attr(*, "endian")= chr "little"
## - attr(*, "winunix")= chr "windows"
names(fhs)
## [1] "SEX" "RANDID" "TOTCHOL" "AGE" "SYSBP" "DIABP"
## [7] "DIABETES" "BPMEDS" "PERIOD" "CIGPDAY" "HEARTRTE" "HDLC"
## [13] "LDLC"
#Note [] extracts a list, [[]] extracts 'element(s)' within the list.
Male<-fhs[which(fhs$SEX==1),]
# or
Male<-subset(fhs,fhs$SEX==1)
#Histogram
hist(Male$SYSBP)
#Scatterplot
plot(Male$SYSBP~Male$AGE)
#Boxplot
boxplot(fhs$SYSBP~fhs$SEX)
#Plot by group using ifelse function
plotcolor=ifelse(fhs$SEX==1,"black","red")
plot(fhs$SYSBP~fhs$AGE,col=plotcolor,pch=20,cex=1)
############################################## # Statistical tests & diagnostic plots # ##############################################
#Regression
model1<-lm(fhs$SYSBP~fhs$AGE+fhs$SEX)
summary(model1) #sex - cont.
##
## Call:
## lm(formula = fhs$SYSBP ~ fhs$AGE + fhs$SEX)
##
## Residuals:
## Min 1Q Median 3Q Max
## -56.565 -14.189 -2.467 11.419 149.420
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 82.95926 1.27463 65.085 < 2e-16 ***
## fhs$AGE 0.92487 0.02036 45.428 < 2e-16 ***
## fhs$SEX 1.71463 0.39308 4.362 1.3e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 20.99 on 11624 degrees of freedom
## Multiple R-squared: 0.1527, Adjusted R-squared: 0.1526
## F-statistic: 1048 on 2 and 11624 DF, p-value: < 2.2e-16
#example of extracting a t value: age
tvalue <- summary(model1)$coef[2,3:4]
print(tvalue)
## t value Pr(>|t|)
## 45.4283 0.0000
#Regression with Interaction
model2<-lm(fhs$SYSBP~fhs$AGE*fhs$SEX)
summary(model2)
##
## Call:
## lm(formula = fhs$SYSBP ~ fhs$AGE * fhs$SEX)
##
## Residuals:
## Min 1Q Median 3Q Max
## -55.281 -14.195 -2.602 11.233 147.160
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 133.531558 3.725307 35.844 <2e-16 ***
## fhs$AGE -0.000739 0.067242 -0.011 0.991
## fhs$SEX -30.494454 2.265754 -13.459 <2e-16 ***
## fhs$AGE:fhs$SEX 0.588629 0.040790 14.431 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 20.8 on 11623 degrees of freedom
## Multiple R-squared: 0.1676, Adjusted R-squared: 0.1674
## F-statistic: 780.2 on 3 and 11623 DF, p-value: < 2.2e-16
summary.aov(model2)
## Df Sum Sq Mean Sq F value Pr(>F)
## fhs$AGE 1 914470 914470 2113.11 < 2e-16 ***
## fhs$SEX 1 8381 8381 19.37 1.09e-05 ***
## fhs$AGE:fhs$SEX 1 90118 90118 208.24 < 2e-16 ***
## Residuals 11623 5029961 433
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# note that it treats SEX as a continuous variable even though we want it to be a categorical variable
fhs$SEX<-as.factor(fhs$SEX)
model2<-lm(fhs$SYSBP~fhs$AGE*fhs$SEX)
summary(model2)
##
## Call:
## lm(formula = fhs$SYSBP ~ fhs$AGE * fhs$SEX)
##
## Residuals:
## Min 1Q Median 3Q Max
## -55.281 -14.195 -2.602 11.233 147.160
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 103.03710 1.70727 60.35 <2e-16 ***
## fhs$AGE 0.58789 0.03086 19.05 <2e-16 ***
## fhs$SEX2 -30.49445 2.26575 -13.46 <2e-16 ***
## fhs$AGE:fhs$SEX2 0.58863 0.04079 14.43 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 20.8 on 11623 degrees of freedom
## Multiple R-squared: 0.1676, Adjusted R-squared: 0.1674
## F-statistic: 780.2 on 3 and 11623 DF, p-value: < 2.2e-16
summary.aov(model2)
## Df Sum Sq Mean Sq F value Pr(>F)
## fhs$AGE 1 914470 914470 2113.11 < 2e-16 ***
## fhs$SEX 1 8381 8381 19.37 1.09e-05 ***
## fhs$AGE:fhs$SEX 1 90118 90118 208.24 < 2e-16 ***
## Residuals 11623 5029961 433
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#Diagnostc plots
plot(model2)
#select
plot(model2,which=1)
plot(model2,which=2)
plot(model2,which=3)
plot(model2,which=4)
############################################## # Vectors and Matrices # ##############################################
# Vectors
# Array
x<-c(1,2,3,4,5,6)
# A vector is a series of values, all of the same type
is.vector(x)
## [1] TRUE
x
## [1] 1 2 3 4 5 6
x[3]
## [1] 3
length(x)
## [1] 6
#It is sometimes better to define the dimensions of the vector before you use it
x2<-vector(length=5)
x2 #logical
## [1] FALSE FALSE FALSE FALSE FALSE
x2[3]=1 #binary code
x2
## [1] 0 0 1 0 0
x3=x2[-3]
x3
## [1] 0 0 0 0
#Matrix
x4<-matrix(1:6,ncol=3)
x4
## [,1] [,2] [,3]
## [1,] 1 3 5
## [2,] 2 4 6
#Transpose
t(x4)
## [,1] [,2]
## [1,] 1 2
## [2,] 3 4
## [3,] 5 6
#Inversion (square matrix, 2x2)
x3<-x4[,-3] #take out the third column
x3
## [,1] [,2]
## [1,] 1 3
## [2,] 2 4
x7<- solve(x3) #inverse
print(x3 %*% x7) #identity
## [,1] [,2]
## [1,] 1 0
## [2,] 0 1
#Simulate Limear regression
one <-rep(1,100) #1 for 100 times
head(one)
## [1] 1 1 1 1 1 1
pred1<-rnorm(100)
head(pred1)
## [1] -0.08963242 0.57614622 0.62662294 -0.09309603 0.63811626 -1.47789589
X <- cbind(one,pred1)
head(X, n=3)
## one pred1
## [1,] 1 -0.08963242
## [2,] 1 0.57614622
## [3,] 1 0.62662294
#rnorm(n, mean = 0, sd = 1)
y=3*one+2*pred1+(rnorm(100)*sqrt(.25)+1) #with mean of 1
y=3*one+2*pred1+(rnorm(100)*sqrt(.25))#standard mean 0 with standard deviation of 0.25
head(y)
## [1] 2.8129684 4.2289147 4.4362017 3.3476747 3.7685168 0.1805651
coef(lm(y~pred1))
## (Intercept) pred1
## 2.902379 1.973771
solve(t(X)%*%X)%*%t(X)%*%y
## [,1]
## one 2.902379
## pred1 1.973771
#Extract data
#write.csv(cbind(y,X),"C:/Users/miche/Downloads/fromR.csv")
df1<-data.frame(x=rnorm(2),y=rexp(2))
df1
## x y
## 1 -0.1820836 3.104689
## 2 2.1446123 1.935409
df2<-data.frame(x=rpois(2,2),y=runif(2))
df2
## x y
## 1 2 0.7177619
## 2 2 0.3966144
df3<-rbind(df1,df2)
df3
## x y
## 1 -0.1820836 3.1046892
## 2 2.1446123 1.9354095
## 3 2.0000000 0.7177619
## 4 2.0000000 0.3966144
df4<-data.frame(id=(1:4),x=rgeom(4,.5))
df4
## id x
## 1 1 1
## 2 2 4
## 3 3 0
## 4 4 0
df5<-data.frame(id=(1:4),y=rbinom(4,3,.5))
df5
## id y
## 1 1 1
## 2 2 1
## 3 3 3
## 4 4 0
df6=merge(df4,df5,by="id")
df6
## id x y
## 1 1 1 1
## 2 2 4 1
## 3 3 0 3
## 4 4 0 0
df7<-data.frame(patid=1:5,z=rt(5,10)) #5 obs with mean of 10 from t distribution
df7
## patid z
## 1 1 0.23141253
## 2 2 -0.47581243
## 3 3 0.78186018
## 4 4 -0.62837861
## 5 5 -0.04752034
df8<-merge(x=df6,y=df7,by.x="id",by.y="patid")
df8
## id x y z
## 1 1 1 1 0.2314125
## 2 2 4 1 -0.4758124
## 3 3 0 3 0.7818602
## 4 4 0 0 -0.6283786
df9<-merge(x=df6,y=df7,all=TRUE,by.x="id",by.y="patid")
df9
## id x y z
## 1 1 1 1 0.23141253
## 2 2 4 1 -0.47581243
## 3 3 0 3 0.78186018
## 4 4 0 0 -0.62837861
## 5 5 NA NA -0.04752034
#rbind is set in sas
#merge is merge in sas
#no need to sort in r
#If then statements
if(mean(fhs$SYSBP,na.rm=TRUE)>140)
print("Hypertensive sample") else print("non-hypertensive sample")
## [1] "non-hypertensive sample"
HT<-ifelse(fhs$SYSBP>140 | fhs$DIABP>80, 1, 0)
# Do loops
len=10
fibvals=numeric(len)
str(len)
## num 10
str(fibvals)
## num [1:10] 0 0 0 0 0 0 0 0 0 0
fibvals[1]=1
fibvals[2]=1
fibvals
## [1] 1 1 0 0 0 0 0 0 0 0
for(i in 3:len)
{
fibvals[i]=fibvals[i-1]+fibvals[i-1]
}
fibvals #sum of the previous 2
## [1] 1 1 2 4 8 16 32 64 128 256
#while
mysum=0
counter=0
while(mysum<10)
{
mysum=mysum+rexp(1)
counter=counter+1
}
mysum
## [1] 10.18687
counter
## [1] 10
#optimization
mysum =0
counter =0
while(counter<10)
{
mysum=mysum+rexp(1)
counter=counter+1
}
mysum
## [1] 10.80217
counter
## [1] 10
# and is &
# or is |
# not is !
#Long to wide
fhs.sort <- fhs[order(fhs$RANDID),]
fhs_wide <- reshape(fhs.sort,
timevar = "PERIOD",
idvar = c("RANDID", "SEX"),
direction = "wide")
#Wide to long
fhs_long <- reshape(fhs_wide,
varying = c("SYSBP.1", "SYSBP.2", "SYSBP.3"),
v.names = "SYSBP",
timevar = "TIME",
times = c("1", "2", "3"),
direction = "long")
fhs_long<-fhs_long[order(fhs_long$RANDID),]
#USER-DEFINED FUNCTIONS
#myfunction <- function(arg1, arg2, ... ){
# statements
# return(object)
#}
# function example - get measures of central tendency
# and spread for a numeric vector x. The user has a
# choice of measures and whether the results are printed.
mysummary <- function(x,npar=TRUE,print=TRUE) {
if (!npar) {
center <- mean(x); spread <- sd(x)
} else {
center <- median(x); spread <- mad(x)
}
if (print & !npar) {
cat("Mean=", center, "\n", "SD=", spread, "\n")
} else if (print & npar) {
cat("Median=", center, "\n", "MAD=", spread, "\n")
} #\n = next line
result <- list(center=center,spread=spread)
return(result)
}
# invoking the function
set.seed(1234)
x <- rpois(500, 4)
y <- mysummary(x)
## Median= 4
## MAD= 1.4826
Median= 4
MAD= 1.4826
y$center # is the median (4)
## [1] 4
y$spread # is the median absolute deviation (1.4826)
## [1] 1.4826
y <- mysummary(x, npar=FALSE, print=FALSE)
# no output
y$center #is the mean (4.052)
## [1] 4.052
y$spread #is the standard deviation (2.01927)
## [1] 2.01927
#R-functions
solveols<-function(y,x){
bh<-solve(t(x)%*%x)%*%(t(x)%*%y)
return(bh)
}
#solveols(y,x)
#R-functions-add residual variance
solveols<-function(y,x){
bh<-solve(t(x)%*%x)%*%(t(x)%*%y)
colnames(bh)="Estimates"
sigmasq<-sum((y-x%*%bh)^2)/(length(y)-2)
varbh=sigmasq*solve(t(x)%*%x)
return(list(bh,s=sqrt(sigmasq),varcovbetahat=varbh))
}
#solveols(y,X)
#solveols(y,X)$varcovbetahat