Main Goals
This project is the second version the Smoke Free Project.
The R-Markdown will include perform the model processing again with addition information from ANRF, to combine more information of the smoke-free policy status
We first by constructing a predictive variable (Y - smoke-free policy status) only contains the consensus between the two data sources
HUD Aim 4: To investigate the impact of HUD smoke-free policies on biomarkers of secondhand smoke exposure (serum cotinine, blood and urine cadmium, and PAHs) among non-smoking HUD residents in public housing buildings
Review background about HUD’s smoke-free policy (can skim through):
# include some specific Rmarkdown format packages
library(formattable)
library(knitr)
library(kableExtra)
# general R parkages
library(dplyr)
library(data.table)
library(tidyverse)
library(ggplot2)
library(readxl)
library(ISLR)
library(Amelia)
library(mlbench)
library(pscl)
library(Matrix)
library(xgboost)
library(mltools)
library(caret)
library(Hmisc)
library(smbinning)
library(InformationValue)
library(car)
library(broom)
library(modelr)
library(randomForest)
library(rsample)
library(e1071)
library(glue)
library(DataExplorer)
library(reshape2)
library(dummies)
library(table1)
library(AICcmodavg)
library(tibble)
library(DT)
# DecisionTree
library(mlr)
library(parallel)
library(parallelMap)
library(rpart.plot)
library(MASS)
# read the data that contains the SELECTED variables from the AHS of 2015
# setting working directory
opts_knit$set(root.dir = "C:/Users/Steve/Box/HUD NHANES/Kahang Ngau/smokefree")
source("http://www.sthda.com/upload/rquery_cormat.r")
The data files we used for this project are:
HUD_PA
ANRF
# load the csv file for the combined dataset
phbfinal <- read.csv('phbfinal2.csv', header = TRUE)
# Load the codebook from excel
codebook <- read_excel('phb_datadictionary.xlsx', skip = 2)
# load the csv file for the State-code dataset
state_code <- read.csv('state_code.csv', header = TRUE)
state_code$STATE_NAME <- gsub(" ", "_", state_code$STATE_NAME)
df_codebook<-codebook %>%
filter(!(`Source file`=="HUD Physical Insp. DATEs"|
`Source file`=="HUD Smoke Free PHA"|
`Source file`=="HUD Physical Insp. Scores"))
# Convert the SMOKEF_POLICY column into binomial
phbfinal$Smoke_Free<-ifelse(phbfinal$SMOKEF_POLICY=="Yes", 1,
ifelse(phbfinal$SMOKEF_POLICY=="No", 0, NA))
phbfinal$Smoke_Free_ANRF<-ifelse(phbfinal$SMOKEF_POLICY_ANRF=="Yes"|phbfinal$SMOKEF_POLICY_ANRF=="Some", 1,
ifelse(phbfinal$SMOKEF_POLICY_ANRF=="No", 0, NA))
phbfinal$DATE_ANRF <- as.numeric(format(as.Date(as.character(phbfinal$SMOKEFDATE_ANRF), format="%m/%d/%y"),"%Y"))
# Sensitivity Analysis: Model 2 (ANRF)
phbfinal$Smoke_Free <- ifelse((is.na(phbfinal$Smoke_Free) & (!(is.na(phbfinal$Smoke_Free_ANRF)))),
phbfinal$Smoke_Free_ANRF,
phbfinal$Smoke_Free)
# Merge state-code dataset
df_phbfinal<-merge(phbfinal,state_code,by.x = "STATE2KX",by.y = "ï..STATE2KX",all.x = TRUE)
There are a total of 178718 observations in the combined data set.
Yes/True: means the (valid, not-null) data point from ANRF match with the (valid, not-null) from HUD_PA
No/False: means the data point does (either null or not-null) does not match with the data point (either null or not-null) from HUD_PA
NA: means both ANRF and HUD_PA have a missing / null data point
Finding the difference between the variables SMOKEF_IMPLEMENT & SMOKEFDATE_ANRF, which is the Year of SF policy implemented
phb_level<-phbfinal %>%
dplyr::group_by(PARTICIPANT_CODE,Smoke_Free,Smoke_Free_ANRF,SMOKEF_IMPLEMENT,DATE_ANRF) %>%
dplyr::summarise(Count = n())
# Use ifelse statement to find out the similarity of the two columns
phb_level$CHECK_YEAR <- ifelse(phb_level$SMOKEF_IMPLEMENT==phb_level$DATE_ANRF, TRUE, FALSE)
phb_level$CHECK_YEAR <- ifelse((!is.na(phb_level$SMOKEF_IMPLEMENT) & is.na(phb_level$DATE_ANRF)) |
(is.na(phb_level$SMOKEF_IMPLEMENT) & !is.na(phb_level$DATE_ANRF)),
FALSE, phb_level$CHECK_YEAR)
phb_level$CHECK_YEAR <- ifelse(is.na(phb_level$SMOKEF_IMPLEMENT) & is.na(phb_level$DATE_ANRF),
NA, phb_level$CHECK_YEAR)
# Group by and count for the values
df_year<-phb_level %>%
dplyr::group_by(CHECK_YEAR) %>%
dplyr::summarise(Count = n())
# Display dataframe
datatable(df_year[,],
rownames = FALSE,
options=list(pageLength = 5))
ggplot(df_year, aes(x=CHECK_YEAR,y=Count)) +
geom_col(fill="#69b3a2") +
xlab("Year Implement") +
ggtitle("Diff on HUD_PA and ANRF Year of SF Policy Implement") +
theme_bw() +
theme(plot.title = element_text(hjust = 0.5)) +
geom_text(aes(label=round(Count/NROW(phb_level),3)), position = position_dodge(0.9), vjust=-0.25)
df_hud_county <- phbfinal %>%
filter(SMOKEF_POLICY==""|is.na(SMOKEF_POLICY))
hud_county <- df_hud_county %>%
dplyr::group_by(STATE2KX, CNTY2KX) %>%
dplyr::summarise(Count = n())
# Display dataframe
hud_county <- hud_county %>% arrange(desc(as.numeric(Count)))
colnames(hud_county) <- c("HUD_State", "HUD_County", "HUD_Freq")
# ANRF
df_anrf_county <- phbfinal %>%
filter(SMOKEF_POLICY_ANRF==""|is.na(SMOKEF_POLICY_ANRF))
anrf_county <- df_anrf_county %>%
dplyr::group_by(STATE2KX, CNTY2KX) %>%
dplyr::summarise(Count = n())
# Display dataframe
anrf_county <- anrf_county %>% arrange(desc(as.numeric(Count)))
colnames(anrf_county) <- c("ANRF_State", "ANRF_County", "ANRF_Freq")
merfedata <- merge(hud_county, anrf_county, by=0)
datatable(merfedata[1:25,-1],
rownames = FALSE,
options=list(pageLength = 5))
# Only select the variables that we want
df_filter<-df_phbfinal %>%
filter(!(Smoke_Free==""|is.na(Smoke_Free))) %>%
filter(!(BUILDING_TYPE_CODE=="ND")) %>%
dplyr::select(c(df_codebook$`Variable Name`,"Smoke_Free","INSPECTION_SCORE_NHPD_1",
"INSPECTION_SCORE_NHPD_2","INSPECTION_SCORE_NHPD_3", "PARTICIPANT_CODE","STATE_NAME",-"STATE2KX",
"SMOKEF_IMPLEMENT","Smoke_Free_ANRF","DATE_ANRF"))
We decide to only keep columns which have valid values greater than 90%.
# Find out null values by columns
df_null <- data.frame(lapply(df_filter,function(x) {length(which(!(is.na(x)|x==""|x==-4)))}))
df_null <- df_null %>%
bind_rows(summarise_if(., is.numeric, ~ .[1] / NROW(df_filter))) %>%
mutate(across(is.numeric, ~ round(., 3)))
df_null <- data.frame(t(df_null))
colnames(df_null)<-c('Number of Valid Values','Percentage')
# Filtering variables process
df_null<-df_null %>%
filter(Percentage>0.7)
df_null$Percentage<-percent(df_null$Percentage)
datatable(df_null[,])
col_lst<-colnames(t(df_null))
df_select<-df_filter %>%
dplyr::select(col_lst)
sapply(df_select, function(x) length(unique(x)))
## STD_ZIP11 CNTY2KX TRACT2KX
## 66880 140 3280
## HUD_TYPE BUILDING_TYPE_CODE UR
## 1 7 2
## CONSTRUCT_DATE TOTAL_DWELLING_UNITS TOTAL_OCCUPIED
## 2792 262 259
## YEAR_NHPD_1 YEAR_NHPD_2 YEAR_NHPD_3
## 11 13 15
## YEAR_18 YEAR_16 YEAR_15_1
## 7 6 5
## YEAR_11_1 Smoke_Free INSPECTION_SCORE_NHPD_1
## 5 2 103
## INSPECTION_SCORE_NHPD_2 INSPECTION_SCORE_NHPD_3 PARTICIPANT_CODE
## 93 98 768
## STATE_NAME SMOKEF_IMPLEMENT Smoke_Free_ANRF
## 49 17 3
## DATE_ANRF
## 17
Since we know that variable STD_ZIP11 is the unique 11 zip code Postnet Bar Code for each building, we will remove this variable. The HUD_TYPE variable only has one unique value, we will remove it as well. And we will also remove the Year of 18, 16, and 11_1 variables.
Extract the YEAR value from the CONSTRUCT_DATE column.
**Extract Inspection Scores AND scores with ’*’**
Definition of REAC Scores with different categories:
a: No health and safety deficiencies observed
b: One or more non-life threatening health and safety deficiencies observed
c: One or more health and safety deficiencies observed calling for immediate attention
’*’: Deficiencies found regarding smoke detectors
We also perform a correlation metric for all numeric variables of the final data set.
# Feature analysis
df_select$BUILDING_TYPE_CODE <- toupper(df_select$BUILDING_TYPE_CODE)
df_select$YEAR_CONSTRU <- as.numeric(format(as.Date(as.character(df_select$CONSTRUCT_DATE), format="%Y-%m-%d"),"%Y"))
# For NHPD data: EXTRACT inspection scores, inspection letter, and inspection results with *.
Letter_func<-function(input_col){
ifelse(grepl("a", input_col,fixed = TRUE),"a",
ifelse(grepl("b", input_col,fixed = TRUE),"b",
ifelse(grepl("c", input_col,fixed = TRUE),"c",NA)))
}
df_select$REAC_SCORE_NHPD1_LETTER <- Letter_func(input_col = df_select$INSPECTION_SCORE_NHPD_1)
df_select$REAC_SCORE_NHPD2_LETTER <- Letter_func(input_col = df_select$INSPECTION_SCORE_NHPD_2)
df_select$REAC_SCORE_NHPD3_LETTER <- Letter_func(input_col = df_select$INSPECTION_SCORE_NHPD_3)
df_select$REAC_SCORE_NHPD1 <- as.numeric(gsub("([0-9]+).*$", "\\1", df_select$INSPECTION_SCORE_NHPD_1))
df_select$REAC_SCORE_NHPD2 <- as.numeric(gsub("([0-9]+).*$", "\\1", df_select$INSPECTION_SCORE_NHPD_2))
df_select$REAC_SCORE_NHPD3 <- as.numeric(gsub("([0-9]+).*$", "\\1", df_select$INSPECTION_SCORE_NHPD_3))
df_select$REAC_SCORE_NHPD1_CODE <- ifelse(grepl('[^[:alnum:]]', df_select$INSPECTION_SCORE_NHPD_1),1,0)
df_select$REAC_SCORE_NHPD2_CODE <- ifelse(grepl('[^[:alnum:]]', df_select$INSPECTION_SCORE_NHPD_2),1,0)
df_select$REAC_SCORE_NHPD3_CODE <- ifelse(grepl('[^[:alnum:]]', df_select$INSPECTION_SCORE_NHPD_3),1,0)
df_try <- df_select %>%
dplyr::group_by(PARTICIPANT_CODE, Smoke_Free, STATE_NAME, BUILDING_TYPE_CODE,UR,REAC_SCORE_NHPD1_LETTER,REAC_SCORE_NHPD2_LETTER,REAC_SCORE_NHPD3_LETTER,
REAC_SCORE_NHPD1_CODE,REAC_SCORE_NHPD2_CODE,REAC_SCORE_NHPD3_CODE) %>%
dplyr::summarise(
TOTAL_DWELLING_UNITS=mean(TOTAL_DWELLING_UNITS, na.rm=TRUE),
YEAR_CONSTRU=mean(YEAR_CONSTRU, na.rm=TRUE),
REAC_SCORE_NHPD1=mean(REAC_SCORE_NHPD1, na.rm=TRUE),
REAC_SCORE_NHPD2=mean(REAC_SCORE_NHPD2, na.rm=TRUE),
REAC_SCORE_NHPD3=mean(REAC_SCORE_NHPD3, na.rm=TRUE))
df_try<-as.data.frame(df_try)
#knitr::kable(df_try[1:10,])
plot_missing(df_try)
YEAR_CONSTRUCT, DWELLING_UNITS, Total_Building_Type : at the building level (STD_ZIP11)
Inspection scores: at the county level
Urban/Rural Indicator (UR): at the census track level
# Find out null values by columns
df_state<-df_try %>% dplyr::group_by(STATE_NAME) %>% dplyr::summarise(Count = n())
df_state$Total<-nrow(df_try)
df_state$Percentage<-df_state$Count/df_state$Total
df_state <- df_state %>%
mutate(across(is.numeric, ~ round(., 3))) %>%
dplyr::select(STATE_NAME,Count,Percentage) %>%
arrange(-Percentage)
df_state$Percentage<-percent(df_state$Percentage)
datatable(df_state[,])
ggplot(data = df_try) +
geom_histogram(mapping = aes(x = REAC_SCORE_NHPD1),binwidth = 1) +
ggtitle("Distribution of Inspection Score 1") +
theme_bw() +
theme(plot.title = element_text(hjust = 0.5))
ggplot(data = df_try) +
geom_histogram(mapping = aes(x = REAC_SCORE_NHPD2),binwidth = 1) +
ggtitle("Distribution of Inspection Score 2") +
theme_bw() +
theme(plot.title = element_text(hjust = 0.5))
ggplot(data = df_try) +
geom_histogram(mapping = aes(x = REAC_SCORE_NHPD3),binwidth = 1) +
ggtitle("Distribution of Inspection Score 3") +
theme_bw() +
theme(plot.title = element_text(hjust = 0.5))
df_try<-df_try %>% mutate_all(~ifelse(is.na(.x), mean(.x, na.rm = TRUE), .x))
# Convert the following two catergorical variables to factor
cols<-c("STATE_NAME","PARTICIPANT_CODE","BUILDING_TYPE_CODE","UR","REAC_SCORE_NHPD1_LETTER","REAC_SCORE_NHPD2_LETTER",
"REAC_SCORE_NHPD3_LETTER","REAC_SCORE_NHPD1_CODE","REAC_SCORE_NHPD2_CODE","REAC_SCORE_NHPD3_CODE")
df_try[cols] <- lapply(df_try[cols], factor)
# These are the final variables that to keep
df_final<-df_try %>%
dplyr::select(-REAC_SCORE_NHPD1_CODE, -REAC_SCORE_NHPD2_CODE, -REAC_SCORE_NHPD3_CODE,
-REAC_SCORE_NHPD1_LETTER, -REAC_SCORE_NHPD2_LETTER, -REAC_SCORE_NHPD3_LETTER,
-PARTICIPANT_CODE)
df_final$DWELLING_UNITS <- as.factor(ifelse(df_final$TOTAL_DWELLING_UNITS<=10, "0_10",
ifelse(df_final$TOTAL_DWELLING_UNITS<=25, "11_25",
ifelse(df_final$TOTAL_DWELLING_UNITS<=50, "26_50",
"51_or_More"))))
df_final$YEAR_CONSTRU_NEW <- as.factor(ifelse(df_final$YEAR_CONSTRU<=1960, "1960_or_Earlier",
ifelse(df_final$YEAR_CONSTRU<=1980, "1961_1980",
ifelse(df_final$YEAR_CONSTRU<=2000, "1981_2000",
"2001_or_Later"))))
df_final<-df_final %>% dplyr::select(-TOTAL_DWELLING_UNITS,-YEAR_CONSTRU)
datatable(df_final[,],
rownames = FALSE,
options=list(pageLength = 10))
table1(~factor(Smoke_Free)+BUILDING_TYPE_CODE+UR+REAC_SCORE_NHPD1+
REAC_SCORE_NHPD2+REAC_SCORE_NHPD3+factor(DWELLING_UNITS)+factor(YEAR_CONSTRU_NEW)
,row_wise = TRUE,test=TRUE,format_number = TRUE, data=df_final, overall="Total",digits=3)
| Total (N=2112) |
|
|---|---|
| factor(Smoke_Free) | |
| 0 | 1029 (48.7%) |
| 1 | 1083 (51.3%) |
| BUILDING_TYPE_CODE | |
| ES | 425 (20.1%) |
| RW | 552 (26.1%) |
| SD | 481 (22.8%) |
| SF | 375 (17.8%) |
| WU | 279 (13.2%) |
| UR | |
| R | 320 (15.2%) |
| U | 1792 (84.8%) |
| REAC_SCORE_NHPD1 | |
| Mean (SD) | 82.4 (12.7) |
| Median [Min, Max] | 85.3 [8.00, 99.3] |
| REAC_SCORE_NHPD2 | |
| Mean (SD) | 85.3 (11.7) |
| Median [Min, Max] | 88.0 [17.0, 100] |
| REAC_SCORE_NHPD3 | |
| Mean (SD) | 85.0 (10.8) |
| Median [Min, Max] | 87.0 [27.0, 100] |
| factor(DWELLING_UNITS) | |
| 0_10 | 1556 (73.7%) |
| 11_25 | 118 (5.6%) |
| 26_50 | 124 (5.9%) |
| 51_or_More | 314 (14.9%) |
| factor(YEAR_CONSTRU_NEW) | |
| 1960_or_Earlier | 146 (6.9%) |
| 1961_1980 | 1376 (65.2%) |
| 1981_2000 | 478 (22.6%) |
| 2001_or_Later | 112 (5.3%) |
# Display correlation metric
res2 <- df_final %>% dplyr::select(REAC_SCORE_NHPD1,REAC_SCORE_NHPD2,REAC_SCORE_NHPD3,Smoke_Free)
res3 <- cor(res2)
knitr::kable(res3[,])
| REAC_SCORE_NHPD1 | REAC_SCORE_NHPD2 | REAC_SCORE_NHPD3 | Smoke_Free | |
|---|---|---|---|---|
| REAC_SCORE_NHPD1 | 1.0000000 | 0.4797795 | 0.4125118 | -0.1028083 |
| REAC_SCORE_NHPD2 | 0.4797795 | 1.0000000 | 0.4721206 | -0.0911106 |
| REAC_SCORE_NHPD3 | 0.4125118 | 0.4721206 | 1.0000000 | -0.0798568 |
| Smoke_Free | -0.1028083 | -0.0911106 | -0.0798568 | 1.0000000 |
## Correlation Metric
rquery.cormat(res2)
## $r
## Smoke_Free REAC_SCORE_NHPD3 REAC_SCORE_NHPD1 REAC_SCORE_NHPD2
## Smoke_Free 1
## REAC_SCORE_NHPD3 -0.08 1
## REAC_SCORE_NHPD1 -0.1 0.41 1
## REAC_SCORE_NHPD2 -0.091 0.47 0.48 1
##
## $p
## Smoke_Free REAC_SCORE_NHPD3 REAC_SCORE_NHPD1 REAC_SCORE_NHPD2
## Smoke_Free 0
## REAC_SCORE_NHPD3 0.00024 0
## REAC_SCORE_NHPD1 2.2e-06 1.4e-87 0
## REAC_SCORE_NHPD2 2.7e-05 1e-117 4.9e-122 0
##
## $sym
## Smoke_Free REAC_SCORE_NHPD3 REAC_SCORE_NHPD1 REAC_SCORE_NHPD2
## Smoke_Free 1
## REAC_SCORE_NHPD3 1
## REAC_SCORE_NHPD1 . 1
## REAC_SCORE_NHPD2 . . 1
## attr(,"legend")
## [1] 0 ' ' 0.3 '.' 0.6 ',' 0.8 '+' 0.9 '*' 0.95 'B' 1
u_r<-df_final %>% group_by(UR) %>% summarise(Count = n())
knitr::kable(u_r[,])
| UR | Count |
|---|---|
| R | 320 |
| U | 1792 |
ggplot(u_r, aes(x=UR,y=Count)) +
geom_col(fill="#69b3a2") +
xlab("UR") +
theme_bw() +
theme(plot.title = element_text(hjust = 0.5)) +
geom_text(aes(label=round(Count/NROW(df_final),3)), position = position_dodge(0.9), vjust=-0.25)
ES=Elevator Structure
RW=Row or Townhouse
SD=Semi-Detached
SF=Single Family/Detached(duplex)
WU=Walk-Up/Multifamily Apt.
Use door numbers in Unit Spreadsheet (table).
htype<-df_final %>% group_by(BUILDING_TYPE_CODE) %>% summarise(Count = n())
knitr::kable(htype[,])
| BUILDING_TYPE_CODE | Count |
|---|---|
| ES | 425 |
| RW | 552 |
| SD | 481 |
| SF | 375 |
| WU | 279 |
ggplot(htype, aes(x=BUILDING_TYPE_CODE,y=Count)) +
geom_col(fill="#69b3a2") +
xlab("BUILDING TYPE CODE") +
theme_bw() +
theme(plot.title = element_text(hjust = 0.5)) +
geom_text(aes(label=round(Count/NROW(df_final),3)), position = position_dodge(0.9), vjust=-0.25)
# df_final$YEAR_CONSTRU_NEW<-factor(df_final$YEAR_CONSTRU_NEW, levels = c("1960_or_Earlier",
# "1960_1980",
# "1980_2000",
# "2000_or_Later"))
year_built<-df_final %>% group_by(YEAR_CONSTRU_NEW) %>% summarise(Count = n())
knitr::kable(year_built[,])
| YEAR_CONSTRU_NEW | Count |
|---|---|
| 1960_or_Earlier | 146 |
| 1961_1980 | 1376 |
| 1981_2000 | 478 |
| 2001_or_Later | 112 |
ggplot(year_built, aes(x=YEAR_CONSTRU_NEW,y=Count)) +
geom_col(fill="#69b3a2") +
xlab("Construction Year") +
theme_bw() +
theme(plot.title = element_text(hjust = 0.5)) +
geom_text(aes(label=round(Count/NROW(df_final),3)), position = position_dodge(0.9), vjust=-0.25)
dwelling_unit<-df_final %>% group_by(DWELLING_UNITS) %>% summarise(Count = n())
knitr::kable(dwelling_unit[,])
| DWELLING_UNITS | Count |
|---|---|
| 0_10 | 1556 |
| 11_25 | 118 |
| 26_50 | 124 |
| 51_or_More | 314 |
ggplot(dwelling_unit, aes(x=DWELLING_UNITS,y=Count)) +
geom_col(fill="#69b3a2") +
xlab("Dwelling Units") +
theme_bw() +
theme(plot.title = element_text(hjust = 0.5)) +
geom_text(aes(label=round(Count/NROW(df_final),3)), position = position_dodge(0.9), vjust=-0.25)
In this section, we will perform One-Hot Encoding, train-test-split the data, and implement XGBoost model.
This ML model is used for better selecting feature variables to the final model: GLM
# XGBoost model
# Create Training Data
input_ones <- df_final[which(df_final$Smoke_Free == 1), ] # all 1's
input_zeros <- df_final[which(df_final$Smoke_Free == 0), ] # all 0's
# for repeatability of samples
set.seed(314)
input_ones_training_rows <- sample(1:nrow(input_ones), 0.7*nrow(input_ones))
input_zeros_training_rows <- sample(1:nrow(input_zeros), 0.7*nrow(input_ones))
training_ones <- input_ones[input_ones_training_rows, ]
training_zeros <- input_zeros[input_zeros_training_rows, ]
# row bind the 1's and 0's
trainingData <- rbind(training_ones, training_zeros)
# Create Test Data
test_ones <- input_ones[-input_ones_training_rows, ]
test_zeros <- input_zeros[-input_zeros_training_rows, ]
# row bind the 1's and 0's
testData <- rbind(test_ones, test_zeros)
# Finally, output the three dataframes for training, validation and test.
dfTraining_x <- one_hot(as.data.table(trainingData %>% dplyr::select(-Smoke_Free)))#,-Weight)))
dfTraining <- cbind(dfTraining_x, as.numeric(as.character(trainingData$Smoke_Free))) %>% dplyr::rename(Smoke_Free = V2)
dfTest_x <- one_hot(as.data.table(testData%>% dplyr::select(-Smoke_Free)))#,-Weight)))
dfTest <- cbind(dfTest_x, as.numeric(as.character(testData$Smoke_Free))) %>% dplyr::rename(Smoke_Free = V2)
# Conduct DMatrix
xg.train <- xgb.DMatrix(data = as.matrix(dfTraining %>% dplyr::select(-Smoke_Free)),
label = dfTraining$Smoke_Free)
#,weight = dfTraining$Weight)
xg.test <- xgb.DMatrix(data = as.matrix(dfTest %>% dplyr::select(-Smoke_Free)),
label = dfTest$Smoke_Free)
#weight = dfTest$Weight)
set.seed(314)
bst <- xgb.train(data = xg.train,
objective = "binary:logistic",
nrounds = 1000,
eta = 0.02,
max_depth = 10,
min_child_weight = 10)
pred <- predict(bst, xg.test)
prediction <- as.numeric(pred > 0.5)
err <- mean(as.numeric(pred > 0.5) != testData$Smoke_Free)
The length of the prediction is 596.
Show the fist 5 results of the prediction: 1, 0, 0, 1, 1, 1.
The overall error from the test data set is: 0.3104027.
# Importance Matrix
importance_matrix <- xgb.importance(model = bst)
print(importance_matrix)
## Feature Gain Cover Frequency
## 1: REAC_SCORE_NHPD2 0.2725354110 0.284929308 0.2947799386
## 2: REAC_SCORE_NHPD3 0.2236264331 0.225247409 0.2691914023
## 3: REAC_SCORE_NHPD1 0.2147400995 0.236332325 0.2612439039
## 4: STATE_NAME_Michigan 0.0713406959 0.016310054 0.0056595822
## 5: STATE_NAME_Nebraska 0.0377331013 0.015929246 0.0059004154
## 6: STATE_NAME_Illinois 0.0326031845 0.022605551 0.0092720814
## 7: STATE_NAME_New_York 0.0178770547 0.029131891 0.0131856223
## 8: YEAR_CONSTRU_NEW_1961_1980 0.0175785124 0.008410799 0.0208320790
## 9: YEAR_CONSTRU_NEW_1960_or_Earlier 0.0141763359 0.014600011 0.0114395810
## 10: STATE_NAME_Washington 0.0129795612 0.008026513 0.0031308327
## 11: STATE_NAME_Texas 0.0102955531 0.023019933 0.0143897887
## 12: UR_R 0.0097281843 0.010546942 0.0108374977
## 13: YEAR_CONSTRU_NEW_1981_2000 0.0096928275 0.009857317 0.0093322897
## 14: BUILDING_TYPE_CODE_RW 0.0091539173 0.009802277 0.0074056235
## 15: DWELLING_UNITS_0_10 0.0068916750 0.004973185 0.0087904148
## 16: STATE_NAME_Massachusetts 0.0059356041 0.014299241 0.0078270817
## 17: STATE_NAME_Pennsylvania 0.0054510033 0.008669392 0.0052381239
## 18: STATE_NAME_California 0.0053758845 0.008979497 0.0069239569
## 19: BUILDING_TYPE_CODE_WU 0.0040520676 0.009826072 0.0074056235
## 20: BUILDING_TYPE_CODE_ES 0.0033600122 0.003723111 0.0045758324
## 21: YEAR_CONSTRU_NEW_2001_or_Later 0.0033445974 0.010177305 0.0042747908
## 22: STATE_NAME_Minnesota 0.0026956864 0.008672021 0.0047564573
## 23: STATE_NAME_Florida 0.0025878442 0.005515911 0.0026491661
## 24: BUILDING_TYPE_CODE_SF 0.0023487892 0.002947906 0.0031308327
## 25: BUILDING_TYPE_CODE_SD 0.0016863234 0.001764472 0.0028899994
## 26: DWELLING_UNITS_51_or_More 0.0014795383 0.002169617 0.0027093744
## 27: DWELLING_UNITS_26_50 0.0004193118 0.002463306 0.0015052080
## 28: DWELLING_UNITS_11_25 0.0003107907 0.001069391 0.0007224998
## Feature Gain Cover Frequency
xgb.plot.importance(importance_matrix = importance_matrix %>% head(30))
# Evaluation Model
# Calculate RMSE
residuals = dfTest$Smoke_Free - pred
RMSE = sqrt(mean(residuals**2))
The RMSE is: 0.4514.
CM<-sum(ifelse(as.numeric(pred>0.5)==dfTest$Smoke_Free, 1, 0)) / length(pred)
The overall rate of accuracy is: 0.6896.
# Calibration Plot
response<-ifelse(dfTest$Smoke_Free==1, "Does Not have Smoke Free Policy", "Has Smoke Free Policy")
testProbs<-data.frame(obs = as.factor(response), lr = pred)
calplotdata<-calibration(obs ~ lr, data = testProbs)
xyplot(calplotdata, auto.key = list(columns = 2))
top_30 <- (importance_matrix %>% head(30))$Feature
glm_function <- as.formula(paste("Smoke_Free ~ ",paste(top_30, collapse = "+")))
logitMod <-glm(glm_function,data=dfTraining, family=binomial)
summary(logitMod)
##
## Call:
## glm(formula = glm_function, family = binomial, data = dfTraining)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.16364 -1.11281 0.07605 1.09242 2.37981
##
## Coefficients: (3 not defined because of singularities)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.4837661 0.6725156 2.206 0.0274 *
## REAC_SCORE_NHPD2 -0.0060457 0.0058923 -1.026 0.3049
## REAC_SCORE_NHPD3 0.0004433 0.0059982 0.074 0.9411
## REAC_SCORE_NHPD1 -0.0078134 0.0052597 -1.486 0.1374
## STATE_NAME_Michigan -2.7488474 0.4705897 -5.841 5.18e-09 ***
## STATE_NAME_Nebraska -1.5109235 0.3441419 -4.390 1.13e-05 ***
## STATE_NAME_Illinois 1.7910805 0.3278121 5.464 4.66e-08 ***
## STATE_NAME_New_York 1.2363476 0.3042012 4.064 4.82e-05 ***
## YEAR_CONSTRU_NEW_1961_1980 -0.3132138 0.2628655 -1.192 0.2334
## YEAR_CONSTRU_NEW_1960_or_Earlier -0.6302046 0.3350885 -1.881 0.0600 .
## STATE_NAME_Washington 1.5681360 0.3825930 4.099 4.15e-05 ***
## STATE_NAME_Texas 0.4900411 0.2311101 2.120 0.0340 *
## UR_R -0.2424715 0.1649122 -1.470 0.1415
## YEAR_CONSTRU_NEW_1981_2000 -0.7143100 0.2805992 -2.546 0.0109 *
## BUILDING_TYPE_CODE_RW -0.2758473 0.1600686 -1.723 0.0848 .
## DWELLING_UNITS_0_10 0.0990069 0.3049574 0.325 0.7454
## STATE_NAME_Massachusetts 0.5196006 0.2529267 2.054 0.0399 *
## STATE_NAME_Pennsylvania -0.2635950 0.2484389 -1.061 0.2887
## STATE_NAME_California -0.1224184 0.2392452 -0.512 0.6089
## BUILDING_TYPE_CODE_WU 0.1068861 0.2248249 0.475 0.6345
## BUILDING_TYPE_CODE_ES 0.2695056 0.4844810 0.556 0.5780
## YEAR_CONSTRU_NEW_2001_or_Later NA NA NA NA
## STATE_NAME_Minnesota 0.3064611 0.2679698 1.144 0.2528
## STATE_NAME_Florida -0.3909693 0.2817487 -1.388 0.1652
## BUILDING_TYPE_CODE_SF 0.1770230 0.1764566 1.003 0.3158
## BUILDING_TYPE_CODE_SD NA NA NA NA
## DWELLING_UNITS_51_or_More -0.3406373 0.4884976 -0.697 0.4856
## DWELLING_UNITS_26_50 -0.2477789 0.4580233 -0.541 0.5885
## DWELLING_UNITS_11_25 NA NA NA NA
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2101.6 on 1515 degrees of freedom
## Residual deviance: 1854.0 on 1490 degrees of freedom
## AIC: 1906
##
## Number of Fisher Scoring iterations: 5
set.seed(314)
logitMod.full <- glm(Smoke_Free ~ REAC_SCORE_NHPD2 + REAC_SCORE_NHPD3 + REAC_SCORE_NHPD1 +
STATE_NAME_Michigan + STATE_NAME_Nebraska + STATE_NAME_Illinois +
STATE_NAME_New_York + YEAR_CONSTRU_NEW_1961_1980 + YEAR_CONSTRU_NEW_1960_or_Earlier +
STATE_NAME_Washington + STATE_NAME_Texas + UR_R + YEAR_CONSTRU_NEW_1981_2000 +
BUILDING_TYPE_CODE_RW + DWELLING_UNITS_0_10 + STATE_NAME_Massachusetts +
STATE_NAME_Pennsylvania + STATE_NAME_California + BUILDING_TYPE_CODE_WU +
BUILDING_TYPE_CODE_ES +
STATE_NAME_Minnesota + STATE_NAME_Florida + BUILDING_TYPE_CODE_SF +
DWELLING_UNITS_51_or_More + DWELLING_UNITS_26_50
,
data=dfTraining, family=binomial)
summary(logitMod.full)
##
## Call:
## glm(formula = Smoke_Free ~ REAC_SCORE_NHPD2 + REAC_SCORE_NHPD3 +
## REAC_SCORE_NHPD1 + STATE_NAME_Michigan + STATE_NAME_Nebraska +
## STATE_NAME_Illinois + STATE_NAME_New_York + YEAR_CONSTRU_NEW_1961_1980 +
## YEAR_CONSTRU_NEW_1960_or_Earlier + STATE_NAME_Washington +
## STATE_NAME_Texas + UR_R + YEAR_CONSTRU_NEW_1981_2000 + BUILDING_TYPE_CODE_RW +
## DWELLING_UNITS_0_10 + STATE_NAME_Massachusetts + STATE_NAME_Pennsylvania +
## STATE_NAME_California + BUILDING_TYPE_CODE_WU + BUILDING_TYPE_CODE_ES +
## STATE_NAME_Minnesota + STATE_NAME_Florida + BUILDING_TYPE_CODE_SF +
## DWELLING_UNITS_51_or_More + DWELLING_UNITS_26_50, family = binomial,
## data = dfTraining)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.16364 -1.11281 0.07605 1.09242 2.37981
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.4837661 0.6725156 2.206 0.0274 *
## REAC_SCORE_NHPD2 -0.0060457 0.0058923 -1.026 0.3049
## REAC_SCORE_NHPD3 0.0004433 0.0059982 0.074 0.9411
## REAC_SCORE_NHPD1 -0.0078134 0.0052597 -1.486 0.1374
## STATE_NAME_Michigan -2.7488474 0.4705897 -5.841 5.18e-09 ***
## STATE_NAME_Nebraska -1.5109235 0.3441419 -4.390 1.13e-05 ***
## STATE_NAME_Illinois 1.7910805 0.3278121 5.464 4.66e-08 ***
## STATE_NAME_New_York 1.2363476 0.3042012 4.064 4.82e-05 ***
## YEAR_CONSTRU_NEW_1961_1980 -0.3132138 0.2628655 -1.192 0.2334
## YEAR_CONSTRU_NEW_1960_or_Earlier -0.6302046 0.3350885 -1.881 0.0600 .
## STATE_NAME_Washington 1.5681360 0.3825930 4.099 4.15e-05 ***
## STATE_NAME_Texas 0.4900411 0.2311101 2.120 0.0340 *
## UR_R -0.2424715 0.1649122 -1.470 0.1415
## YEAR_CONSTRU_NEW_1981_2000 -0.7143100 0.2805992 -2.546 0.0109 *
## BUILDING_TYPE_CODE_RW -0.2758473 0.1600686 -1.723 0.0848 .
## DWELLING_UNITS_0_10 0.0990069 0.3049574 0.325 0.7454
## STATE_NAME_Massachusetts 0.5196006 0.2529267 2.054 0.0399 *
## STATE_NAME_Pennsylvania -0.2635950 0.2484389 -1.061 0.2887
## STATE_NAME_California -0.1224184 0.2392452 -0.512 0.6089
## BUILDING_TYPE_CODE_WU 0.1068861 0.2248249 0.475 0.6345
## BUILDING_TYPE_CODE_ES 0.2695056 0.4844810 0.556 0.5780
## STATE_NAME_Minnesota 0.3064611 0.2679698 1.144 0.2528
## STATE_NAME_Florida -0.3909693 0.2817487 -1.388 0.1652
## BUILDING_TYPE_CODE_SF 0.1770230 0.1764566 1.003 0.3158
## DWELLING_UNITS_51_or_More -0.3406373 0.4884976 -0.697 0.4856
## DWELLING_UNITS_26_50 -0.2477789 0.4580233 -0.541 0.5885
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2101.6 on 1515 degrees of freedom
## Residual deviance: 1854.0 on 1490 degrees of freedom
## AIC: 1906
##
## Number of Fisher Scoring iterations: 5
coef(logitMod.full)
## (Intercept) REAC_SCORE_NHPD2
## 1.4837660674 -0.0060457225
## REAC_SCORE_NHPD3 REAC_SCORE_NHPD1
## 0.0004433476 -0.0078133850
## STATE_NAME_Michigan STATE_NAME_Nebraska
## -2.7488474284 -1.5109234683
## STATE_NAME_Illinois STATE_NAME_New_York
## 1.7910804675 1.2363475776
## YEAR_CONSTRU_NEW_1961_1980 YEAR_CONSTRU_NEW_1960_or_Earlier
## -0.3132138177 -0.6302045846
## STATE_NAME_Washington STATE_NAME_Texas
## 1.5681359810 0.4900410779
## UR_R YEAR_CONSTRU_NEW_1981_2000
## -0.2424715488 -0.7143099606
## BUILDING_TYPE_CODE_RW DWELLING_UNITS_0_10
## -0.2758473236 0.0990068888
## STATE_NAME_Massachusetts STATE_NAME_Pennsylvania
## 0.5196005762 -0.2635949690
## STATE_NAME_California BUILDING_TYPE_CODE_WU
## -0.1224184128 0.1068861395
## BUILDING_TYPE_CODE_ES STATE_NAME_Minnesota
## 0.2695056392 0.3064611156
## STATE_NAME_Florida BUILDING_TYPE_CODE_SF
## -0.3909692505 0.1770229859
## DWELLING_UNITS_51_or_More DWELLING_UNITS_26_50
## -0.3406373467 -0.2477788980
Since log(odds) are hard to interpret, we will transform it by exponentiating the outcome as follow
exp(coef(logitMod.full))
## (Intercept) REAC_SCORE_NHPD2
## 4.40952100 0.99397252
## REAC_SCORE_NHPD3 REAC_SCORE_NHPD1
## 1.00044345 0.99221706
## STATE_NAME_Michigan STATE_NAME_Nebraska
## 0.06400159 0.22070607
## STATE_NAME_Illinois STATE_NAME_New_York
## 5.99592737 3.44301513
## YEAR_CONSTRU_NEW_1961_1980 YEAR_CONSTRU_NEW_1960_or_Earlier
## 0.73109358 0.53248285
## STATE_NAME_Washington STATE_NAME_Texas
## 4.79769686 1.63238327
## UR_R YEAR_CONSTRU_NEW_1981_2000
## 0.78468607 0.48952979
## BUILDING_TYPE_CODE_RW DWELLING_UNITS_0_10
## 0.75892879 1.10407391
## STATE_NAME_Massachusetts STATE_NAME_Pennsylvania
## 1.68135594 0.76828466
## STATE_NAME_California BUILDING_TYPE_CODE_WU
## 0.88477809 1.11280754
## BUILDING_TYPE_CODE_ES STATE_NAME_Minnesota
## 1.30931702 1.35860864
## STATE_NAME_Florida BUILDING_TYPE_CODE_SF
## 0.67640095 1.19365853
## DWELLING_UNITS_51_or_More DWELLING_UNITS_26_50
## 0.71131682 0.78053250
Misclassification error is the percentage mismatch of predcited vs actuals, irrespective of 1’s or 0’s. The lower the misclassification error, the better is the model.
# predicted scores
predicted <- logitMod.full %>% predict(dfTest,type = "response")
optCutOff <- optimalCutoff(dfTest$Smoke_Free, predicted,
#optimiseFor = "Both",
returnDiagnostics = FALSE)
model = pscl::pR2(logitMod.full)["McFadden"]
miss<-misClassError(dfTest$Smoke_Free, predicted, threshold = optCutOff)
The optimal CutOff value is: 0.4062.
The McFadden is: 0.1178.
The Misclassification Error is: 0.3641.
Concordance(testData$Smoke_Free, predicted)
## $Concordance
## [1] 0.6603463
##
## $Discordance
## [1] 0.3396537
##
## $Tied
## [1] 5.551115e-17
##
## $Pairs
## [1] 88075
predicted.classes <- ifelse(predicted > optCutOff, 1, 0)
m_acc<-mean(predicted.classes == dfTest$Smoke_Free)
table(predicted.classes, dfTest$Smoke_Free)
##
## predicted.classes 0 1
## 0 85 31
## 1 186 294
It is drawn by the False positive rate on X-axis and the True Positive rate on Y-axis.
It considers all possible threshold values and corresponds specificity and sensitivity rate, reflecting the final model accuracy.
plotROC(testData$Smoke_Free, predicted)