# Set the working directory; relative paths used later in this script
# (for example exported files) resolve against it.
# NOTE(review): the path below ends with a trailing space inside the quotes.
# Confirm the folder name really ends in a space -- if it does not, setwd()
# will stop with "cannot change working directory".
setwd("/Users/sbhaumik/Documents/MPHCourses/RforHealthScience ")
# Import the Framingham dataset from the Excel workbook
library(readxl)
dat <- read_excel("/Users/sbhaumik/Downloads/frmgham2-3.xls")
# Open the spreadsheet-style data viewer. The base R function is View()
# (capital V); lowercase view() is not provided by base R or readxl, so the
# original call fails unless another package happens to be attached.
# View() needs an interactive session, so it is skipped when the script is
# run non-interactively (e.g. via Rscript or when knitting).
if (interactive()) {
  View(dat)
}
# Get a first look at the data: preview the leading rows
head(dat, n = 6L)
## # A tibble: 6 × 39
##   RANDID   SEX TOTCHOL   AGE SYSBP DIABP CURSMOKE CIGPDAY   BMI DIABETES BPMEDS
##    <dbl> <dbl>   <dbl> <dbl> <dbl> <dbl>    <dbl>   <dbl> <dbl>    <dbl>  <dbl>
## 1   2448     1     195    39  106   70          0       0  27.0        0      0
## 2   2448     1     209    52  121   66          0       0  NA          0      0
## 3   6238     2     250    46  121   81          0       0  28.7        0      0
## 4   6238     2     260    52  105   69.5        0       0  29.4        0      0
## 5   6238     2     237    58  108   66          0       0  28.5        0      0
## 6   9428     1     245    48  128.  80          1      20  25.3        0      0
## # ℹ 28 more variables: HEARTRTE <dbl>, GLUCOSE <dbl>, educ <dbl>,
## #   PREVCHD <dbl>, PREVAP <dbl>, PREVMI <dbl>, PREVSTRK <dbl>, PREVHYP <dbl>,
## #   TIME <dbl>, PERIOD <dbl>, HDLC <dbl>, LDLC <dbl>, DEATH <dbl>,
## #   ANGINA <dbl>, HOSPMI <dbl>, MI_FCHD <dbl>, ANYCHD <dbl>, STROKE <dbl>,
## #   CVD <dbl>, HYPERTEN <dbl>, TIMEAP <dbl>, TIMEMI <dbl>, TIMEMIFC <dbl>,
## #   TIMECHD <dbl>, TIMESTRK <dbl>, TIMECVD <dbl>, TIMEDTH <dbl>, TIMEHYP <dbl>
ncol(dat) # 39 columns, i.e. variables, in the dataset
## [1] 39
nrow(dat) # 11627 rows, i.e. observations, in the dataset
## [1] 11627
# Inspect the class of three variables of interest
class(dat[["CURSMOKE"]])
## [1] "numeric"
class(dat[["PERIOD"]])
## [1] "numeric"
class(dat[["HEARTRTE"]])
## [1] "numeric"
# CURSMOKE, PERIOD and HEARTRTE are all stored as numeric variables
# How many unique participants?
# Counted as the number of rows recorded at PERIOD == 1.
# NOTE(review): this assumes every participant has exactly one PERIOD 1 row;
# length(unique(dat$RANDID)) is a direct cross-check -- confirm they agree.
# subset() is base R, so this works without dplyr attached (with only readxl
# loaded, a bare filter() resolves to stats::filter, a time-series filter).
# Like dplyr::filter(), subset() drops rows where the condition is NA.
a <- subset(dat, PERIOD == 1)
nrow(a)
## [1] 4434
# New variable: natural log of glucose (NA glucose stays NA)
dat$lngluc <- log(dat$GLUCOSE)
# New variable: BMI category.
# cut() with right = FALSE builds left-closed intervals [lower, upper), which
# reproduces the original thresholds exactly:
#   < 18.5 Underweight | [18.5, 25) Normal | [25, 30) Overweight |
#   [30, 35) Obesity Class I | [35, 40) Obesity Class II | >= 40 Class III
# Factor levels follow this clinical order rather than alphabetical order, so
# tables and plots are sorted sensibly. If a model needs "Normal" as the
# reference level, use relevel(dat$bmicat, ref = "Normal").
dat$bmicat <- cut(
  dat$BMI,
  breaks = c(-Inf, 18.5, 25, 30, 35, 40, Inf),
  labels = c(
    "Underweight", "Normal", "Overweight",
    "Obesity Class I", "Obesity Class II", "Obesity Class III"
  ),
  right = FALSE
)
# Participants with missing data for BMI are marked as NA in this classification
# Restrict the dataset to period 3.
# subset() is base R, so this works without dplyr attached (with only readxl
# loaded, a bare filter() resolves to stats::filter, a time-series filter).
# Rows where PERIOD is NA are dropped, as dplyr::filter() would do.
p3 <- subset(dat, PERIOD == 3)
nrow(p3)
## [1] 3263
ncol(p3)
## [1] 41
# There are 3263 cases and 41 variables
# Export the period 3 dataset to the current working directory.
# row.names = FALSE keeps write.csv() from adding an unnamed leading column of
# row numbers, so the file has exactly the same columns as p3 when re-read.
write.csv(p3, file = "period3.csv", row.names = FALSE)