# Set the working directory so that relative paths used later resolve here.
# NOTE(review): the path below ends with a trailing space -- confirm that the
# directory name really contains it; setwd() errors if the directory is absent.
setwd("/Users/sbhaumik/Documents/MPHCourses/RforHealthScience ")
# Import the Framingham dataset from the Excel workbook
library(readxl)
dat <- read_excel("/Users/sbhaumik/Downloads/frmgham2-3.xls")
# Open the spreadsheet-style data viewer. Base R spells this View() (capital
# V); lowercase view() is not defined by base R or by readxl. The viewer is
# only opened in interactive sessions, since a non-interactive run (Rscript,
# knitr) may not have a data viewer available.
if (interactive()) {
  View(dat)
}
# Get acquainted with the dataset: preview the first six rows
head(dat, n = 6L)
## # A tibble: 6 × 39
## RANDID SEX TOTCHOL AGE SYSBP DIABP CURSMOKE CIGPDAY BMI DIABETES BPMEDS
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 2448 1 195 39 106 70 0 0 27.0 0 0
## 2 2448 1 209 52 121 66 0 0 NA 0 0
## 3 6238 2 250 46 121 81 0 0 28.7 0 0
## 4 6238 2 260 52 105 69.5 0 0 29.4 0 0
## 5 6238 2 237 58 108 66 0 0 28.5 0 0
## 6 9428 1 245 48 128. 80 1 20 25.3 0 0
## # ℹ 28 more variables: HEARTRTE <dbl>, GLUCOSE <dbl>, educ <dbl>,
## # PREVCHD <dbl>, PREVAP <dbl>, PREVMI <dbl>, PREVSTRK <dbl>, PREVHYP <dbl>,
## # TIME <dbl>, PERIOD <dbl>, HDLC <dbl>, LDLC <dbl>, DEATH <dbl>,
## # ANGINA <dbl>, HOSPMI <dbl>, MI_FCHD <dbl>, ANYCHD <dbl>, STROKE <dbl>,
## # CVD <dbl>, HYPERTEN <dbl>, TIMEAP <dbl>, TIMEMI <dbl>, TIMEMIFC <dbl>,
## # TIMECHD <dbl>, TIMESTRK <dbl>, TIMECVD <dbl>, TIMEDTH <dbl>, TIMEHYP <dbl>
dim(dat)[2L] # Number of columns (variables): 39
## [1] 39
dim(dat)[1L] # Number of rows (observations): 11627
## [1] 11627
# Inspect how selected variables are stored
class(dat[["CURSMOKE"]])
## [1] "numeric"
class(dat[["PERIOD"]])
## [1] "numeric"
class(dat[["HEARTRTE"]])
## [1] "numeric"
# CURSMOKE, PERIOD and HEARTRTE are all stored as numeric
# How many unique participants?
# Counted as the number of rows recorded for examination period 1.
# NOTE(review): this assumes every participant has exactly one PERIOD == 1
# row; length(unique(dat$RANDID)) would count participant IDs directly.
#
# Base subsetting is used instead of filter(): this script only attaches
# readxl, so a bare filter() call resolves to stats::filter() (a time-series
# filter) rather than a data-frame row filter. which() drops rows whose
# PERIOD is NA instead of returning all-NA rows.
a <- dat[which(dat$PERIOD == 1), ]
nrow(a)
## [1] 4434
# Derived variable: natural logarithm of glucose
dat$lngluc <- log(dat$GLUCOSE)
# Derived variable: BMI category
# findInterval() gives the number of cut points that are <= BMI, i.e.
# 0 for BMI < 18.5, 1 for [18.5, 25), ..., 5 for BMI >= 40; adding 1 turns
# that count into an index into the label vector.
dat$bmicat <- local({
  bmi_labels <- c(
    "Underweight", "Normal", "Overweight",
    "Obesity Class I", "Obesity Class II", "Obesity Class III"
  )
  bmi_bin <- findInterval(dat$BMI, c(18.5, 25, 30, 35, 40))
  as.factor(bmi_labels[bmi_bin + 1L])
})
# A missing BMI stays NA in bmicat
# Restrict the dataset to examination period 3.
# Base subsetting is used instead of filter(): this script only attaches
# readxl, so a bare filter() call resolves to stats::filter() (a time-series
# filter) rather than a data-frame row filter. which() drops rows whose
# PERIOD is NA instead of returning all-NA rows.
p3 <- dat[which(dat$PERIOD == 3), ]
nrow(p3)
## [1] 3263
ncol(p3)
## [1] 41
# There are 3263 cases and 41 variables
# Export the period 3 dataset to period3.csv (relative to the working
# directory). row.names = FALSE stops write.csv() from prepending an unnamed
# column of row numbers, so the file holds exactly the columns of p3.
write.csv(p3, file = "period3.csv", row.names = FALSE)