Main Goals
This project finds the distribution of the data, and compare the difference between two data source (ANRF vs. HUD_PA)
Identify predictors of HUD’s public housing buildings that have smoke-free policies
Develop a model to predict smoke-policy status for housing authorities missing
HUD Aim 4: To investigate the impact of HUD smoke-free policies on biomarkers of secondhand smoke exposure (serum cotinine, blood and urine cadmium, and PAHs) among non-smoking HUD residents in public housing buildings
Review background about HUD’s smoke-free policy (can skim through):
# include some specific Rmarkdown format packages
library(formattable)
library(knitr)
library(kableExtra)
# general R parkages
library(dplyr)
library(data.table)
library(tidyverse)
library(ggplot2)
library(readxl)
library(ISLR)
library(Amelia)
library(mlbench)
library(pscl)
library(Matrix)
library(xgboost)
library(mltools)
library(caret)
library(Hmisc)
library(smbinning)
library(InformationValue)
library(car)
library(broom)
library(modelr)
library(randomForest)
library(rsample)
library(e1071)
library(glue)
library(DataExplorer)
library(reshape2)
library(dummies)
library(table1)
library(AICcmodavg)
library(tibble)
library(DT)
# DecisionTree
library(mlr)
library(parallel)
library(parallelMap)
library(rpart.plot)
library(MASS)
# read the data that contains the SELECTED variables from the AHS of 2015
# setting working directory
opts_knit$set(root.dir = "C:/Users/Steve/Box/HUD NHANES/Kahang Ngau/smokefree")
source("http://www.sthda.com/upload/rquery_cormat.r")
The data files we used for this project are:
HUD_PA
ANRF
# load the csv file for the combined dataset
phbfinal <- read.csv('phbfinal2.csv', header = TRUE)
# Load the codebook from excel
codebook <- read_excel('phb_datadictionary.xlsx', skip = 2)
# load the csv file for the State-code dataset
state_code <- read.csv('state_code.csv', header = TRUE)
state_code$STATE_NAME <- gsub(" ", "_", state_code$STATE_NAME)
df_codebook<-codebook %>%
filter(!(`Source file`=="HUD Physical Insp. DATEs"|
`Source file`=="HUD Smoke Free PHA"|
`Source file`=="HUD Physical Insp. Scores"))
# Convert the SMOKEF_POLICY column into binomial
phbfinal$Smoke_Free<-ifelse(phbfinal$SMOKEF_POLICY=="Yes", 1,
ifelse(phbfinal$SMOKEF_POLICY=="No", 0, NA))
phbfinal$Smoke_Free_ANRF<-ifelse(phbfinal$SMOKEF_POLICY_ANRF=="Yes"|phbfinal$SMOKEF_POLICY_ANRF=="Some", 1,
ifelse(phbfinal$SMOKEF_POLICY_ANRF=="No", 0, NA))
phbfinal$DATE_ANRF <- as.numeric(format(as.Date(as.character(phbfinal$SMOKEFDATE_ANRF), format="%m/%d/%y"),"%Y"))
# Merge state-code dataset
df_phbfinal<-merge(phbfinal,state_code,by.x = "STATE2KX",by.y = "ï..STATE2KX",all.x = TRUE)
There are a total of 178718 observations in the combined data set.
Yes/True: means the (valid, not-null) data point from ANRF match with the (valid, not-null) from HUD_PA
No/False: means the data point does (either null or not-null) does not match with the data point (either null or not-null) from HUD_PA
NA: means both ANRF and HUD_PA have a missing / null data point
Finding the difference between the variables SMOKEF_IMPLEMENT & SMOKEFDATE_ANRF, which is the Year of SF policy implemented
phb_level<-phbfinal %>%
dplyr::group_by(PARTICIPANT_CODE,Smoke_Free,Smoke_Free_ANRF,SMOKEF_IMPLEMENT,DATE_ANRF) %>%
dplyr::summarise(Count = n())
# Use ifelse statement to find out the similarity of the two columns
phb_level$CHECK_YEAR <- ifelse(phb_level$SMOKEF_IMPLEMENT==phb_level$DATE_ANRF, TRUE, FALSE)
phb_level$CHECK_YEAR <- ifelse((!is.na(phb_level$SMOKEF_IMPLEMENT) & is.na(phb_level$DATE_ANRF)) |
(is.na(phb_level$SMOKEF_IMPLEMENT) & !is.na(phb_level$DATE_ANRF)),
FALSE, phb_level$CHECK_YEAR)
phb_level$CHECK_YEAR <- ifelse(is.na(phb_level$SMOKEF_IMPLEMENT) & is.na(phb_level$DATE_ANRF),
NA, phb_level$CHECK_YEAR)
# Group by and count for the values
df_year<-phb_level %>%
dplyr::group_by(CHECK_YEAR) %>%
dplyr::summarise(Count = n())
# Display dataframe
datatable(df_year[,],
rownames = FALSE,
options=list(pageLength = 5))
ggplot(df_year, aes(x=CHECK_YEAR,y=Count)) +
geom_col(fill="#69b3a2") +
xlab("Year Implement") +
ggtitle("Diff on HUD_PA and ANRF Year of SF Policy Implement") +
theme_bw() +
theme(plot.title = element_text(hjust = 0.5)) +
geom_text(aes(label=round(Count/NROW(phb_level),3)), position = position_dodge(0.9), vjust=-0.25)
df_hud_county <- phbfinal %>%
filter(SMOKEF_POLICY==""|is.na(SMOKEF_POLICY))
hud_county <- df_hud_county %>%
dplyr::group_by(STATE2KX, CNTY2KX) %>%
dplyr::summarise(Count = n())
# Display dataframe
hud_county <- hud_county %>% arrange(desc(as.numeric(Count)))
colnames(hud_county) <- c("HUD_State", "HUD_County", "HUD_Freq")
# ANRF
df_anrf_county <- phbfinal %>%
filter(SMOKEF_POLICY_ANRF==""|is.na(SMOKEF_POLICY_ANRF))
anrf_county <- df_anrf_county %>%
dplyr::group_by(STATE2KX, CNTY2KX) %>%
dplyr::summarise(Count = n())
# Display dataframe
anrf_county <- anrf_county %>% arrange(desc(as.numeric(Count)))
colnames(anrf_county) <- c("ANRF_State", "ANRF_County", "ANRF_Freq")
merfedata <- merge(hud_county, anrf_county, by=0)
datatable(merfedata[1:25,-1],
rownames = FALSE,
options=list(pageLength = 5))
# Only select the variables that we want
df_filter<-df_phbfinal %>%
filter(!(Smoke_Free==""|is.na(Smoke_Free))) %>%
filter(!(BUILDING_TYPE_CODE=="ND")) %>%
dplyr::select(c(df_codebook$`Variable Name`,"Smoke_Free","INSPECTION_SCORE_NHPD_1",
"INSPECTION_SCORE_NHPD_2","INSPECTION_SCORE_NHPD_3", "PARTICIPANT_CODE","STATE_NAME",-"STATE2KX",
"SMOKEF_IMPLEMENT","Smoke_Free_ANRF","DATE_ANRF"))
We decide to only keep columns which have valid values greater than 90%.
# Find out null values by columns
df_null <- data.frame(lapply(df_filter,function(x) {length(which(!(is.na(x)|x==""|x==-4)))}))
df_null <- df_null %>%
bind_rows(summarise_if(., is.numeric, ~ .[1] / NROW(df_filter))) %>%
mutate(across(is.numeric, ~ round(., 3)))
df_null <- data.frame(t(df_null))
colnames(df_null)<-c('Number of Valid Values','Percentage')
# Filtering variables process
df_null<-df_null %>%
filter(Percentage>0.7)
df_null$Percentage<-percent(df_null$Percentage)
datatable(df_null[,])
col_lst<-colnames(t(df_null))
df_select<-df_filter %>%
dplyr::select(col_lst)
sapply(df_select, function(x) length(unique(x)))
## STD_ZIP11 CNTY2KX TRACT2KX
## 51626 120 2955
## HUD_TYPE BUILDING_TYPE_CODE UR
## 1 7 2
## CONSTRUCT_DATE TOTAL_DWELLING_UNITS TOTAL_OCCUPIED
## 2522 259 257
## YEAR_NHPD_1 YEAR_NHPD_2 YEAR_NHPD_3
## 11 13 15
## YEAR_18 YEAR_16 YEAR_15_1
## 7 6 5
## YEAR_11_1 Smoke_Free INSPECTION_SCORE_NHPD_1
## 5 2 102
## INSPECTION_SCORE_NHPD_2 INSPECTION_SCORE_NHPD_3 PARTICIPANT_CODE
## 92 97 612
## STATE_NAME SMOKEF_IMPLEMENT Smoke_Free_ANRF
## 49 17 3
## DATE_ANRF
## 16
Since we know that variable STD_ZIP11 is the unique 11 zip code Postnet Bar Code for each building, we will remove this variable. The HUD_TYPE variable only has one unique value, we will remove it as well. And we will also remove the Year of 18, 16, and 11_1 variables.
Extract the YEAR value from the CONSTRUCT_DATE column.
**Extract Inspection Scores AND scores with ’*’**
Definition of REAC Scores with different categories:
a: No health and safety deficiencies observed
b: One or more non-life threatening health and safety deficiencies observed
c: One or more health and safety deficiencies observed calling for immediate attention
’*’: Deficiencies found regarding smoke detectors
We also perform a correlation metric for all numeric variables of the final data set.
# Feature analysis
df_select$BUILDING_TYPE_CODE <- toupper(df_select$BUILDING_TYPE_CODE)
df_select$YEAR_CONSTRU <- as.numeric(format(as.Date(as.character(df_select$CONSTRUCT_DATE), format="%Y-%m-%d"),"%Y"))
# For NHPD data: EXTRACT inspection scores, inspection letter, and inspection results with *.
Letter_func<-function(input_col){
ifelse(grepl("a", input_col,fixed = TRUE),"a",
ifelse(grepl("b", input_col,fixed = TRUE),"b",
ifelse(grepl("c", input_col,fixed = TRUE),"c",NA)))
}
df_select$REAC_SCORE_NHPD1_LETTER <- Letter_func(input_col = df_select$INSPECTION_SCORE_NHPD_1)
df_select$REAC_SCORE_NHPD2_LETTER <- Letter_func(input_col = df_select$INSPECTION_SCORE_NHPD_2)
df_select$REAC_SCORE_NHPD3_LETTER <- Letter_func(input_col = df_select$INSPECTION_SCORE_NHPD_3)
df_select$REAC_SCORE_NHPD1 <- as.numeric(gsub("([0-9]+).*$", "\\1", df_select$INSPECTION_SCORE_NHPD_1))
df_select$REAC_SCORE_NHPD2 <- as.numeric(gsub("([0-9]+).*$", "\\1", df_select$INSPECTION_SCORE_NHPD_2))
df_select$REAC_SCORE_NHPD3 <- as.numeric(gsub("([0-9]+).*$", "\\1", df_select$INSPECTION_SCORE_NHPD_3))
df_select$REAC_SCORE_NHPD1_CODE <- ifelse(grepl('[^[:alnum:]]', df_select$INSPECTION_SCORE_NHPD_1),1,0)
df_select$REAC_SCORE_NHPD2_CODE <- ifelse(grepl('[^[:alnum:]]', df_select$INSPECTION_SCORE_NHPD_2),1,0)
df_select$REAC_SCORE_NHPD3_CODE <- ifelse(grepl('[^[:alnum:]]', df_select$INSPECTION_SCORE_NHPD_3),1,0)
df_try <- df_select %>%
dplyr::group_by(PARTICIPANT_CODE, Smoke_Free, STATE_NAME, BUILDING_TYPE_CODE,UR,REAC_SCORE_NHPD1_LETTER,REAC_SCORE_NHPD2_LETTER,REAC_SCORE_NHPD3_LETTER,
REAC_SCORE_NHPD1_CODE,REAC_SCORE_NHPD2_CODE,REAC_SCORE_NHPD3_CODE) %>%
dplyr::summarise(
TOTAL_DWELLING_UNITS=mean(TOTAL_DWELLING_UNITS, na.rm=TRUE),
YEAR_CONSTRU=mean(YEAR_CONSTRU, na.rm=TRUE),
REAC_SCORE_NHPD1=mean(REAC_SCORE_NHPD1, na.rm=TRUE),
REAC_SCORE_NHPD2=mean(REAC_SCORE_NHPD2, na.rm=TRUE),
REAC_SCORE_NHPD3=mean(REAC_SCORE_NHPD3, na.rm=TRUE))
df_try<-as.data.frame(df_try)
#knitr::kable(df_try[1:10,])
plot_missing(df_try)
YEAR_CONSTRUCT, DWELLING_UNITS, Total_Building_Type : at the building level (STD_ZIP11)
Inspection scores: at the county level
Urban/Rural Indicator (UR): at the census track level
# Find out null values by columns
df_state<-df_try %>% dplyr::group_by(STATE_NAME) %>% dplyr::summarise(Count = n())
df_state$Total<-nrow(df_try)
df_state$Percentage<-df_state$Count/df_state$Total
df_state <- df_state %>%
mutate(across(is.numeric, ~ round(., 3))) %>%
dplyr::select(STATE_NAME,Count,Percentage) %>%
arrange(-Percentage)
df_state$Percentage<-percent(df_state$Percentage)
datatable(df_state[,])
ggplot(data = df_try) +
geom_histogram(mapping = aes(x = REAC_SCORE_NHPD1),binwidth = 1) +
ggtitle("Distribution of Inspection Score 1") +
theme_bw() +
theme(plot.title = element_text(hjust = 0.5))
ggplot(data = df_try) +
geom_histogram(mapping = aes(x = REAC_SCORE_NHPD2),binwidth = 1) +
ggtitle("Distribution of Inspection Score 2") +
theme_bw() +
theme(plot.title = element_text(hjust = 0.5))
ggplot(data = df_try) +
geom_histogram(mapping = aes(x = REAC_SCORE_NHPD3),binwidth = 1) +
ggtitle("Distribution of Inspection Score 3") +
theme_bw() +
theme(plot.title = element_text(hjust = 0.5))
df_try<-df_try %>% mutate_all(~ifelse(is.na(.x), mean(.x, na.rm = TRUE), .x))
# Convert the following two catergorical variables to factor
cols<-c("STATE_NAME","PARTICIPANT_CODE","BUILDING_TYPE_CODE","UR","REAC_SCORE_NHPD1_LETTER","REAC_SCORE_NHPD2_LETTER",
"REAC_SCORE_NHPD3_LETTER","REAC_SCORE_NHPD1_CODE","REAC_SCORE_NHPD2_CODE","REAC_SCORE_NHPD3_CODE")
df_try[cols] <- lapply(df_try[cols], factor)
# These are the final variables that to keep
df_final<-df_try %>%
dplyr::select(-REAC_SCORE_NHPD1_CODE, -REAC_SCORE_NHPD2_CODE, -REAC_SCORE_NHPD3_CODE,
-REAC_SCORE_NHPD1_LETTER, -REAC_SCORE_NHPD2_LETTER, -REAC_SCORE_NHPD3_LETTER,
-PARTICIPANT_CODE)
df_final$DWELLING_UNITS <- as.factor(ifelse(df_final$TOTAL_DWELLING_UNITS<=10, "0_10",
ifelse(df_final$TOTAL_DWELLING_UNITS<=25, "11_25",
ifelse(df_final$TOTAL_DWELLING_UNITS<=50, "26_50",
"51_or_More"))))
df_final$YEAR_CONSTRU_NEW <- as.factor(ifelse(df_final$YEAR_CONSTRU<=1960, "1960_or_Earlier",
ifelse(df_final$YEAR_CONSTRU<=1980, "1961_1980",
ifelse(df_final$YEAR_CONSTRU<=2000, "1981_2000",
"2001_or_Later"))))
df_final<-df_final %>% dplyr::select(-TOTAL_DWELLING_UNITS,-YEAR_CONSTRU)
datatable(df_final[,],
rownames = FALSE,
options=list(pageLength = 10))
table1(~factor(Smoke_Free)+BUILDING_TYPE_CODE+UR+REAC_SCORE_NHPD1+
REAC_SCORE_NHPD2+REAC_SCORE_NHPD3+factor(DWELLING_UNITS)+factor(YEAR_CONSTRU_NEW)
,row_wise = TRUE,test=TRUE,format_number = TRUE, data=df_final, overall="Total",digits=3)
| Total (N=1665) |
|
|---|---|
| factor(Smoke_Free) | |
| 0 | 1029 (61.8%) |
| 1 | 636 (38.2%) |
| BUILDING_TYPE_CODE | |
| ES | 352 (21.1%) |
| RW | 444 (26.7%) |
| SD | 356 (21.4%) |
| SF | 277 (16.6%) |
| WU | 236 (14.2%) |
| UR | |
| R | 257 (15.4%) |
| U | 1408 (84.6%) |
| REAC_SCORE_NHPD1 | |
| Mean (SD) | 82.9 (12.4) |
| Median [Min, Max] | 86.0 [8.00, 99.3] |
| REAC_SCORE_NHPD2 | |
| Mean (SD) | 85.4 (11.7) |
| Median [Min, Max] | 88.3 [17.0, 100] |
| REAC_SCORE_NHPD3 | |
| Mean (SD) | 85.2 (11.1) |
| Median [Min, Max] | 88.0 [27.0, 100] |
| factor(DWELLING_UNITS) | |
| 0_10 | 1200 (72.1%) |
| 11_25 | 99 (5.9%) |
| 26_50 | 111 (6.7%) |
| 51_or_More | 255 (15.3%) |
| factor(YEAR_CONSTRU_NEW) | |
| 1960_or_Earlier | 123 (7.4%) |
| 1961_1980 | 1061 (63.7%) |
| 1981_2000 | 393 (23.6%) |
| 2001_or_Later | 88 (5.3%) |
# Display correlation metric
res2 <- df_final %>% dplyr::select(REAC_SCORE_NHPD1,REAC_SCORE_NHPD2,REAC_SCORE_NHPD3,Smoke_Free)
res3 <- cor(res2)
knitr::kable(res3[,])
| REAC_SCORE_NHPD1 | REAC_SCORE_NHPD2 | REAC_SCORE_NHPD3 | Smoke_Free | |
|---|---|---|---|---|
| REAC_SCORE_NHPD1 | 1.0000000 | 0.4878180 | 0.4440660 | -0.0873812 |
| REAC_SCORE_NHPD2 | 0.4878180 | 1.0000000 | 0.5127785 | -0.1022394 |
| REAC_SCORE_NHPD3 | 0.4440660 | 0.5127785 | 1.0000000 | -0.0804665 |
| Smoke_Free | -0.0873812 | -0.1022394 | -0.0804665 | 1.0000000 |
## Correlation Metric
rquery.cormat(res2)
## $r
## Smoke_Free REAC_SCORE_NHPD1 REAC_SCORE_NHPD2 REAC_SCORE_NHPD3
## Smoke_Free 1
## REAC_SCORE_NHPD1 -0.087 1
## REAC_SCORE_NHPD2 -0.1 0.49 1
## REAC_SCORE_NHPD3 -0.08 0.44 0.51 1
##
## $p
## Smoke_Free REAC_SCORE_NHPD1 REAC_SCORE_NHPD2 REAC_SCORE_NHPD3
## Smoke_Free 0
## REAC_SCORE_NHPD1 0.00036 0
## REAC_SCORE_NHPD2 2.9e-05 2.9e-100 0
## REAC_SCORE_NHPD3 0.001 2.1e-81 2.6e-112 0
##
## $sym
## Smoke_Free REAC_SCORE_NHPD1 REAC_SCORE_NHPD2 REAC_SCORE_NHPD3
## Smoke_Free 1
## REAC_SCORE_NHPD1 1
## REAC_SCORE_NHPD2 . 1
## REAC_SCORE_NHPD3 . . 1
## attr(,"legend")
## [1] 0 ' ' 0.3 '.' 0.6 ',' 0.8 '+' 0.9 '*' 0.95 'B' 1
u_r<-df_final %>% group_by(UR) %>% summarise(Count = n())
knitr::kable(u_r[,])
| UR | Count |
|---|---|
| R | 257 |
| U | 1408 |
ggplot(u_r, aes(x=UR,y=Count)) +
geom_col(fill="#69b3a2") +
xlab("UR") +
theme_bw() +
theme(plot.title = element_text(hjust = 0.5)) +
geom_text(aes(label=round(Count/NROW(df_final),3)), position = position_dodge(0.9), vjust=-0.25)
ES=Elevator Structure
RW=Row or Townhouse
SD=Semi-Detached
SF=Single Family/Detached(duplex)
WU=Walk-Up/Multifamily Apt.
Use door numbers in Unit Spreadsheet (table).
htype<-df_final %>% group_by(BUILDING_TYPE_CODE) %>% summarise(Count = n())
knitr::kable(htype[,])
| BUILDING_TYPE_CODE | Count |
|---|---|
| ES | 352 |
| RW | 444 |
| SD | 356 |
| SF | 277 |
| WU | 236 |
ggplot(htype, aes(x=BUILDING_TYPE_CODE,y=Count)) +
geom_col(fill="#69b3a2") +
xlab("BUILDING TYPE CODE") +
theme_bw() +
theme(plot.title = element_text(hjust = 0.5)) +
geom_text(aes(label=round(Count/NROW(df_final),3)), position = position_dodge(0.9), vjust=-0.25)
# df_final$YEAR_CONSTRU_NEW<-factor(df_final$YEAR_CONSTRU_NEW, levels = c("1960_or_Earlier",
# "1960_1980",
# "1980_2000",
# "2000_or_Later"))
year_built<-df_final %>% group_by(YEAR_CONSTRU_NEW) %>% summarise(Count = n())
knitr::kable(year_built[,])
| YEAR_CONSTRU_NEW | Count |
|---|---|
| 1960_or_Earlier | 123 |
| 1961_1980 | 1061 |
| 1981_2000 | 393 |
| 2001_or_Later | 88 |
ggplot(year_built, aes(x=YEAR_CONSTRU_NEW,y=Count)) +
geom_col(fill="#69b3a2") +
xlab("Construction Year") +
theme_bw() +
theme(plot.title = element_text(hjust = 0.5)) +
geom_text(aes(label=round(Count/NROW(df_final),3)), position = position_dodge(0.9), vjust=-0.25)
dwelling_unit<-df_final %>% group_by(DWELLING_UNITS) %>% summarise(Count = n())
knitr::kable(dwelling_unit[,])
| DWELLING_UNITS | Count |
|---|---|
| 0_10 | 1200 |
| 11_25 | 99 |
| 26_50 | 111 |
| 51_or_More | 255 |
ggplot(dwelling_unit, aes(x=DWELLING_UNITS,y=Count)) +
geom_col(fill="#69b3a2") +
xlab("Dwelling Units") +
theme_bw() +
theme(plot.title = element_text(hjust = 0.5)) +
geom_text(aes(label=round(Count/NROW(df_final),3)), position = position_dodge(0.9), vjust=-0.25)
In this section, we will perform One-Hot Encoding, train-test-split the data, and implement XGBoost model.
This ML model is used for better selecting feature variables to the final model: GLM
# XGBoost model
# Create Training Data
input_ones <- df_final[which(df_final$Smoke_Free == 1), ] # all 1's
input_zeros <- df_final[which(df_final$Smoke_Free == 0), ] # all 0's
# for repeatability of samples
set.seed(314)
input_ones_training_rows <- sample(1:nrow(input_ones), 0.7*nrow(input_ones))
input_zeros_training_rows <- sample(1:nrow(input_zeros), 0.7*nrow(input_ones))
training_ones <- input_ones[input_ones_training_rows, ]
training_zeros <- input_zeros[input_zeros_training_rows, ]
# row bind the 1's and 0's
trainingData <- rbind(training_ones, training_zeros)
# Create Test Data
test_ones <- input_ones[-input_ones_training_rows, ]
test_zeros <- input_zeros[-input_zeros_training_rows, ]
# row bind the 1's and 0's
testData <- rbind(test_ones, test_zeros)
# Finally, output the three dataframes for training, validation and test.
dfTraining_x <- one_hot(as.data.table(trainingData %>% dplyr::select(-Smoke_Free)))#,-Weight)))
dfTraining <- cbind(dfTraining_x, as.numeric(as.character(trainingData$Smoke_Free))) %>% dplyr::rename(Smoke_Free = V2)
dfTest_x <- one_hot(as.data.table(testData%>% dplyr::select(-Smoke_Free)))#,-Weight)))
dfTest <- cbind(dfTest_x, as.numeric(as.character(testData$Smoke_Free))) %>% dplyr::rename(Smoke_Free = V2)
# Conduct DMatrix
xg.train <- xgb.DMatrix(data = as.matrix(dfTraining %>% dplyr::select(-Smoke_Free)),
label = dfTraining$Smoke_Free)
#,weight = dfTraining$Weight)
xg.test <- xgb.DMatrix(data = as.matrix(dfTest %>% dplyr::select(-Smoke_Free)),
label = dfTest$Smoke_Free)
#weight = dfTest$Weight)
set.seed(314)
bst <- xgb.train(data = xg.train,
objective = "binary:logistic",
nrounds = 1000,
eta = 0.02,
max_depth = 10,
min_child_weight = 10)
pred <- predict(bst, xg.test)
prediction <- as.numeric(pred > 0.5)
err <- mean(as.numeric(pred > 0.5) != testData$Smoke_Free)
The length of the prediction is 775.
Show the fist 5 results of the prediction: 1, 1, 0, 1, 1, 1.
The overall error from the test data set is: 0.3406452.
# Importance Matrix
importance_matrix <- xgb.importance(model = bst)
print(importance_matrix)
## Feature Gain Cover Frequency
## 1: REAC_SCORE_NHPD2 0.2806893642 0.2654202498 0.288877953
## 2: REAC_SCORE_NHPD1 0.2504051269 0.2806568945 0.298146325
## 3: REAC_SCORE_NHPD3 0.2198485577 0.2674097341 0.278215223
## 4: STATE_NAME_Michigan 0.0602136533 0.0093176950 0.003526903
## 5: STATE_NAME_Illinois 0.0384106244 0.0142359199 0.005987533
## 6: STATE_NAME_Massachusetts 0.0234745332 0.0249762782 0.011646982
## 7: STATE_NAME_New_York 0.0232367671 0.0087134277 0.003937008
## 8: UR_R 0.0148097272 0.0171963615 0.012303150
## 9: YEAR_CONSTRU_NEW_1961_1980 0.0142873641 0.0125720997 0.019438976
## 10: DWELLING_UNITS_0_10 0.0140384140 0.0093540753 0.011154856
## 11: YEAR_CONSTRU_NEW_1981_2000 0.0107501017 0.0142215667 0.015748031
## 12: BUILDING_TYPE_CODE_RW 0.0098951988 0.0081234990 0.010088583
## 13: YEAR_CONSTRU_NEW_2001_or_Later 0.0098573921 0.0116711043 0.004839239
## 14: BUILDING_TYPE_CODE_SF 0.0055779767 0.0137144784 0.008694226
## 15: STATE_NAME_Nebraska 0.0052091302 0.0042672699 0.001804462
## 16: STATE_NAME_Pennsylvania 0.0046066827 0.0075799831 0.003198819
## 17: BUILDING_TYPE_CODE_ES 0.0045467714 0.0051065199 0.006069554
## 18: BUILDING_TYPE_CODE_WU 0.0037663657 0.0047223438 0.004593176
## 19: DWELLING_UNITS_26_50 0.0016304650 0.0085771791 0.004347113
## 20: DWELLING_UNITS_51_or_More 0.0010409857 0.0015103062 0.001722441
## 21: BUILDING_TYPE_CODE_SD 0.0010364458 0.0007773171 0.000902231
## 22: DWELLING_UNITS_11_25 0.0010007823 0.0025356355 0.001230315
## 23: YEAR_CONSTRU_NEW_1960_or_Earlier 0.0009610014 0.0029012272 0.001558399
## 24: STATE_NAME_California 0.0007065683 0.0044388342 0.001968504
## Feature Gain Cover Frequency
xgb.plot.importance(importance_matrix = importance_matrix %>% head(30))
# Evaluation Model
# Calculate RMSE
residuals = dfTest$Smoke_Free - pred
RMSE = sqrt(mean(residuals**2))
The RMSE is: 0.4621.
CM<-sum(ifelse(as.numeric(pred>0.5)==dfTest$Smoke_Free, 1, 0)) / length(pred)
The overall rate of accuracy is: 0.6594.
# Calibration Plot
response<-ifelse(dfTest$Smoke_Free==1, "Does Not have Smoke Free Policy", "Has Smoke Free Policy")
testProbs<-data.frame(obs = as.factor(response), lr = pred)
calplotdata<-calibration(obs ~ lr, data = testProbs)
xyplot(calplotdata, auto.key = list(columns = 2))
top_30 <- (importance_matrix %>% head(30))$Feature
glm_function <- as.formula(paste("Smoke_Free ~ ",paste(top_30, collapse = "+")))
logitMod <-glm(glm_function,data=dfTraining, family=binomial)
summary(logitMod)
##
## Call:
## glm(formula = glm_function, family = binomial, data = dfTraining)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.25725 -1.13074 0.09743 1.15350 2.36845
##
## Coefficients: (3 not defined because of singularities)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.1817570 0.7824058 -0.232 0.816301
## REAC_SCORE_NHPD2 0.0018686 0.0079813 0.234 0.814894
## REAC_SCORE_NHPD1 -0.0064189 0.0069277 -0.927 0.354155
## REAC_SCORE_NHPD3 -0.0008392 0.0080463 -0.104 0.916933
## STATE_NAME_Michigan -2.7128044 0.6106418 -4.443 8.89e-06 ***
## STATE_NAME_Illinois 1.7651564 0.3867541 4.564 5.02e-06 ***
## STATE_NAME_Massachusetts 1.1393955 0.3329585 3.422 0.000622 ***
## STATE_NAME_New_York 1.5592723 0.3963086 3.934 8.34e-05 ***
## UR_R -0.3733475 0.2280513 -1.637 0.101605
## YEAR_CONSTRU_NEW_1961_1980 -0.0604525 0.2879200 -0.210 0.833697
## DWELLING_UNITS_0_10 0.7787091 0.3878083 2.008 0.044646 *
## YEAR_CONSTRU_NEW_1981_2000 -0.0820695 0.3073157 -0.267 0.789429
## BUILDING_TYPE_CODE_RW -0.1724674 0.2130172 -0.810 0.418147
## YEAR_CONSTRU_NEW_2001_or_Later 0.7202084 0.4363586 1.650 0.098841 .
## BUILDING_TYPE_CODE_SF -0.1744096 0.2343850 -0.744 0.456806
## STATE_NAME_Nebraska -0.7111278 0.3498749 -2.033 0.042101 *
## STATE_NAME_Pennsylvania -0.7051241 0.3311262 -2.129 0.033215 *
## BUILDING_TYPE_CODE_ES 1.2523746 0.6074054 2.062 0.039223 *
## BUILDING_TYPE_CODE_WU 0.1358777 0.2941901 0.462 0.644174
## DWELLING_UNITS_26_50 -0.5923533 0.5896873 -1.005 0.315128
## DWELLING_UNITS_51_or_More -0.5958083 0.5924598 -1.006 0.314583
## BUILDING_TYPE_CODE_SD NA NA NA NA
## DWELLING_UNITS_11_25 NA NA NA NA
## YEAR_CONSTRU_NEW_1960_or_Earlier NA NA NA NA
## STATE_NAME_California -0.1514219 0.3163130 -0.479 0.632146
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1233.8 on 889 degrees of freedom
## Residual deviance: 1089.7 on 868 degrees of freedom
## AIC: 1133.7
##
## Number of Fisher Scoring iterations: 5
set.seed(314)
logitMod.full <- glm(Smoke_Free ~ REAC_SCORE_NHPD1 + REAC_SCORE_NHPD2 + REAC_SCORE_NHPD3 +
STATE_NAME_Michigan + STATE_NAME_Illinois + STATE_NAME_Massachusetts +
STATE_NAME_New_York + DWELLING_UNITS_0_10 + UR_R + BUILDING_TYPE_CODE_SF +
BUILDING_TYPE_CODE_RW + STATE_NAME_Pennsylvania + STATE_NAME_Nebraska +
BUILDING_TYPE_CODE_ES + BUILDING_TYPE_CODE_SD + STATE_NAME_California,
data=dfTraining, family=binomial)
summary(logitMod.full)
##
## Call:
## glm(formula = Smoke_Free ~ REAC_SCORE_NHPD1 + REAC_SCORE_NHPD2 +
## REAC_SCORE_NHPD3 + STATE_NAME_Michigan + STATE_NAME_Illinois +
## STATE_NAME_Massachusetts + STATE_NAME_New_York + DWELLING_UNITS_0_10 +
## UR_R + BUILDING_TYPE_CODE_SF + BUILDING_TYPE_CODE_RW + STATE_NAME_Pennsylvania +
## STATE_NAME_Nebraska + BUILDING_TYPE_CODE_ES + BUILDING_TYPE_CODE_SD +
## STATE_NAME_California, family = binomial, data = dfTraining)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.1372 -1.1428 0.1241 1.1394 2.3397
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.1816352 0.6860408 -0.265 0.791195
## REAC_SCORE_NHPD1 -0.0062461 0.0069029 -0.905 0.365545
## REAC_SCORE_NHPD2 0.0015035 0.0079507 0.189 0.850018
## REAC_SCORE_NHPD3 -0.0009516 0.0079916 -0.119 0.905216
## STATE_NAME_Michigan -2.6916641 0.6054683 -4.446 8.77e-06 ***
## STATE_NAME_Illinois 1.7645859 0.3848223 4.585 4.53e-06 ***
## STATE_NAME_Massachusetts 1.1191262 0.3314736 3.376 0.000735 ***
## STATE_NAME_New_York 1.5672203 0.3911608 4.007 6.16e-05 ***
## DWELLING_UNITS_0_10 0.9617505 0.3647290 2.637 0.008367 **
## UR_R -0.3488157 0.2258292 -1.545 0.122443
## BUILDING_TYPE_CODE_SF -0.3345835 0.2981555 -1.122 0.261787
## BUILDING_TYPE_CODE_RW -0.3645670 0.2782376 -1.310 0.190104
## STATE_NAME_Pennsylvania -0.7260335 0.3295247 -2.203 0.027575 *
## STATE_NAME_Nebraska -0.7552164 0.3481935 -2.169 0.030086 *
## BUILDING_TYPE_CODE_ES 0.6735963 0.3356234 2.007 0.044750 *
## BUILDING_TYPE_CODE_SD -0.1704616 0.2910535 -0.586 0.558097
## STATE_NAME_California -0.1590174 0.3143628 -0.506 0.612969
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1233.8 on 889 degrees of freedom
## Residual deviance: 1096.1 on 873 degrees of freedom
## AIC: 1130.1
##
## Number of Fisher Scoring iterations: 5
coef(logitMod.full)
## (Intercept) REAC_SCORE_NHPD1 REAC_SCORE_NHPD2
## -0.1816352400 -0.0062460987 0.0015034529
## REAC_SCORE_NHPD3 STATE_NAME_Michigan STATE_NAME_Illinois
## -0.0009516058 -2.6916641140 1.7645858709
## STATE_NAME_Massachusetts STATE_NAME_New_York DWELLING_UNITS_0_10
## 1.1191261901 1.5672203306 0.9617505112
## UR_R BUILDING_TYPE_CODE_SF BUILDING_TYPE_CODE_RW
## -0.3488156788 -0.3345835364 -0.3645669554
## STATE_NAME_Pennsylvania STATE_NAME_Nebraska BUILDING_TYPE_CODE_ES
## -0.7260335217 -0.7552164115 0.6735963428
## BUILDING_TYPE_CODE_SD STATE_NAME_California
## -0.1704616261 -0.1590174213
Since log(odds) are hard to interpret, we will transform it by exponentiating the outcome as follow
exp(coef(logitMod.full))
## (Intercept) REAC_SCORE_NHPD1 REAC_SCORE_NHPD2
## 0.83390546 0.99377337 1.00150458
## REAC_SCORE_NHPD3 STATE_NAME_Michigan STATE_NAME_Illinois
## 0.99904885 0.06776807 5.83915369
## STATE_NAME_Massachusetts STATE_NAME_New_York DWELLING_UNITS_0_10
## 3.06217727 4.79330585 2.61627228
## UR_R BUILDING_TYPE_CODE_SF BUILDING_TYPE_CODE_RW
## 0.70552316 0.71563606 0.69449733
## STATE_NAME_Pennsylvania STATE_NAME_Nebraska BUILDING_TYPE_CODE_ES
## 0.48382427 0.46990891 1.96127808
## BUILDING_TYPE_CODE_SD STATE_NAME_California
## 0.84327545 0.85298150
Misclassification error is the percentage mismatch of predcited vs actuals, irrespective of 1’s or 0’s. The lower the misclassification error, the better is the model.
# predicted scores
predicted <- logitMod.full %>% predict(dfTest,type = "response")
optCutOff <- optimalCutoff(dfTest$Smoke_Free, predicted,
#optimiseFor = "Both",
returnDiagnostics = FALSE)
model = pscl::pR2(logitMod.full)["McFadden"]
miss<-misClassError(dfTest$Smoke_Free, predicted, threshold = optCutOff)
The optimal CutOff value is: 0.7846.
The McFadden is: 0.1116.
The Misclassification Error is: 0.2323.
Concordance(testData$Smoke_Free, predicted)
## $Concordance
## [1] 0.7046816
##
## $Discordance
## [1] 0.2953184
##
## $Tied
## [1] 0
##
## $Pairs
## [1] 111544
predicted.classes <- ifelse(predicted > optCutOff, 1, 0)
m_acc<-mean(predicted.classes == dfTest$Smoke_Free)
table(predicted.classes, dfTest$Smoke_Free)
##
## predicted.classes 0 1
## 0 558 154
## 1 26 37
It is drawn by the False positive rate on X-axis and the True Positive rate on Y-axis.
It considers all possible threshold values and corresponds specificity and sensitivity rate, reflecting the final model accuracy.
plotROC(testData$Smoke_Free, predicted)