Part 2-Project

Objective

The primary objective of this exercise is to provide descriptive statistics for the project and to prepare the data for further statistical analysis. This process involves cleaning and preparing the data, as well as combining several sources of data for descriptive and subsequent statistical analysis.

Dependent Variables: Coronary heart disease and four related factors

setwd("C:\\Users\\anami\\OneDrive\\Documents\\Stat ll\\Folder")
data1 <- read.csv("C:\\Users\\anami\\OneDrive\\Documents\\Stat ll\\Folder\\DATA_PROJECT.csv")

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

data2<-data1%>%
filter(StateAbbr=="TX")
names(data2)

##  [1] "StateAbbr"          "TractFIPS"          "TotalPopulation"   
##  [4] "BPHIGH_CrudePrev"   "CHD_CrudePrev"      "DIABETES_CrudePrev"
##  [7] "HIGHCHOL_CrudePrev" "OBESITY_CrudePrev"  "Geolocation"       
## [10] "States"

data3<-subset(data2, select=c(TractFIPS,BPHIGH_CrudePrev, CHD_CrudePrev,DIABETES_CrudePrev,HIGHCHOL_CrudePrev,OBESITY_CrudePrev))
names(data3)

## [1] "TractFIPS"          "BPHIGH_CrudePrev"   "CHD_CrudePrev"     
## [4] "DIABETES_CrudePrev" "HIGHCHOL_CrudePrev" "OBESITY_CrudePrev"

nrow(data3)

## [1] 5222

data3%>%
na.omit(data3)
nrow(data3)

The data contains 5222 observations for Texas census tracts and there is no missing value in 5222 entries. However, the total number of census tracts in Texas is 5265.

Independent variable: Walkability Index

setwd("C:\\Users\\anami\\OneDrive\\Documents\\Stat ll\\Folder")
dataA <- read.csv("C:\\Users\\anami\\OneDrive\\Documents\\Stat ll\\Folder\\WalkabilityIndex_Tract_2019.csv")

library(dplyr)
dataB<-dataA%>%
filter(StAbbr=="TX")
dataB

nrow(dataB)

## [1] 5265

The data contains 5265 walkability index for Texas census tracts and the total number of census tracts in Texas is 5265.

Calculating walkability index for Texas census tracts

library(ggplot2)
dataB$rank1 <- as.numeric(cut_number(dataB$D2A_EPHHM,20))
dataB$rank2 <- as.numeric(cut_number(dataB$D2B_E8MIXA,20))
dataB$rank3 <- as.numeric(cut_number(dataB$D3B,20))

library(dplyr)
dataC<-subset(dataB, select=c(TractID2019,Pop2018,rank1,rank2,rank3,D4A,NatWalkInd)) %>%
mutate(d4A =ifelse(D4A ==-99999.00,1, D4A))
dataC

dataC$rank4 <- as.numeric( cut(dataC$d4A,20, labels=F))

library(dplyr)
dataD<-dataC %>%
mutate(WI=(rank1/6)+(rank2/6)+(rank3/3)+(rank4/3))
names(dataD)

##  [1] "TractID2019" "Pop2018"     "rank1"       "rank2"       "rank3"      
##  [6] "D4A"         "NatWalkInd"  "d4A"         "rank4"       "WI"

library(dplyr)
dataE<-subset(dataD, select=c(TractID2019,WI)) %>%
group_by(TractID2019)
names(dataE)

## [1] "TractID2019" "WI"

Control Variables

file1<- read.csv("C:\\Users\\anami\\OneDrive\\Documents\\Stat ll\\Folder\\ACS2019.csv")

file2<-subset(file1, select=c(geoid,totpop,fpop,mage,hispan,nh_white,nh_black,nh_asian))
names(file2)

## [1] "geoid"    "totpop"   "fpop"     "mage"     "hispan"   "nh_white" "nh_black"
## [8] "nh_asian"

nrow(file2)

## [1] 5265

Combining datasets:

combineA <-data3 %>%
  left_join(dataE, by = c("TractFIPS" = "TractID2019")) %>%
  select(TractFIPS,WI, BPHIGH_CrudePrev,CHD_CrudePrev,OBESITY_CrudePrev,DIABETES_CrudePrev,HIGHCHOL_CrudePrev)
names(combineA)

## [1] "TractFIPS"          "WI"                 "BPHIGH_CrudePrev"  
## [4] "CHD_CrudePrev"      "OBESITY_CrudePrev"  "DIABETES_CrudePrev"
## [7] "HIGHCHOL_CrudePrev"

combineB <-combineA %>%
  left_join(file2, by = c("TractFIPS" = "geoid")) %>%
  select(TractFIPS,WI, BPHIGH_CrudePrev,CHD_CrudePrev,OBESITY_CrudePrev,DIABETES_CrudePrev,HIGHCHOL_CrudePrev,totpop,fpop,mage,hispan,nh_white,nh_black,nh_asian)
names(combineB)

##  [1] "TractFIPS"          "WI"                 "BPHIGH_CrudePrev"  
##  [4] "CHD_CrudePrev"      "OBESITY_CrudePrev"  "DIABETES_CrudePrev"
##  [7] "HIGHCHOL_CrudePrev" "totpop"             "fpop"              
## [10] "mage"               "hispan"             "nh_white"          
## [13] "nh_black"           "nh_asian"

Variables description

library(ggplot2)
combineB$group <- as.numeric(cut_number(combineB$WI, 4))
combineB$group<-factor(combineB$group, levels = c(1, 2, 3,4), 
                      labels = c("Q1", "Q2", "Q3","Q4"))
table(combineB$group)

## 
##   Q1   Q2   Q3   Q4 
## 1332 1346 1246 1298

library(dplyr)
final1<-subset(combineB, select=c(WI,totpop,fpop,mage,hispan,nh_white,nh_black,nh_asian,group))%>%
  mutate(percentfemale=(fpop*100)/totpop)
names(final1)

##  [1] "WI"            "totpop"        "fpop"          "mage"         
##  [5] "hispan"        "nh_white"      "nh_black"      "nh_asian"     
##  [9] "group"         "percentfemale"

final1$mage <- as.numeric(final1$mage)
final1$hispan <- as.numeric(final1$hispan)
final1$nh_white <- as.numeric(final1$nh_white)
final1$nh_black <- as.numeric(final1$nh_black)
final1$nh_asian <- as.numeric(final1$nh_asian)

library(dplyr)
final2<-subset(final1, select=c(WI,percentfemale,mage,hispan,nh_white,nh_black,nh_asian,group))%>%
group_by(group)
names(final2)

## [1] "WI"            "percentfemale" "mage"          "hispan"       
## [5] "nh_white"      "nh_black"      "nh_asian"      "group"

library(arsenal) 
table_1 <- tableby(group ~ ., data =final1) 
summary(table_1, title = "Descriptive Analysis I")

## 
## 
## Table: Descriptive Analysis I
## 
## |                            |     Q1 (N=1332)     |     Q2 (N=1346)     |     Q3 (N=1246)     |     Q4 (N=1298)     |   Total (N=5222)    | p value|
## |:---------------------------|:-------------------:|:-------------------:|:-------------------:|:-------------------:|:-------------------:|-------:|
## |**WI**                      |                     |                     |                     |                     |                     | < 0.001|
## |&nbsp;&nbsp;&nbsp;Mean (SD) |    4.273 (1.175)    |    7.222 (0.693)    |    9.505 (0.658)    |   12.629 (1.419)    |    8.358 (3.243)    |        |
## |&nbsp;&nbsp;&nbsp;Range     |    1.000 - 6.000    |    6.000 - 8.333    |   8.500 - 10.667    |   10.833 - 19.000   |   1.000 - 19.000    |        |
## |**totpop**                  |                     |                     |                     |                     |                     | < 0.001|
## |&nbsp;&nbsp;&nbsp;Mean (SD) | 5333.612 (3307.716) | 6087.311 (4283.567) | 5358.671 (3096.476) | 4842.710 (2545.362) | 5411.841 (3407.698) |        |
## |&nbsp;&nbsp;&nbsp;Range     | 37.000 - 35550.000  | 16.000 - 72041.000  | 690.000 - 49119.000 | 454.000 - 41538.000 | 16.000 - 72041.000  |        |
## |**fpop**                    |                     |                     |                     |                     |                     | < 0.001|
## |&nbsp;&nbsp;&nbsp;Mean (SD) | 2641.559 (1698.413) | 3077.238 (2209.441) | 2721.681 (1594.162) | 2446.108 (1323.460) | 2724.393 (1757.038) |        |
## |&nbsp;&nbsp;&nbsp;Range     |  0.000 - 18773.000  |  0.000 - 37626.000  | 288.000 - 26376.000 | 220.000 - 21646.000 |  0.000 - 37626.000  |        |
## |**mage**                    |                     |                     |                     |                     |                     | < 0.001|
## |&nbsp;&nbsp;&nbsp;Mean (SD) |   38.267 (7.862)    |   35.750 (6.367)    |   34.822 (5.957)    |   35.503 (6.186)    |   36.109 (6.773)    |        |
## |&nbsp;&nbsp;&nbsp;Range     |   18.800 - 73.700   |   19.200 - 60.300   |   19.300 - 64.700   |   19.800 - 61.700   |   18.800 - 73.700   |        |
## |**hispan**                  |                     |                     |                     |                     |                     | < 0.001|
## |&nbsp;&nbsp;&nbsp;Mean (SD) |   33.441 (28.103)   |   38.595 (26.960)   |   44.914 (28.211)   |   41.608 (27.779)   |   39.537 (28.067)   |        |
## |&nbsp;&nbsp;&nbsp;Range     |   0.000 - 100.000   |   0.600 - 100.000   |   0.000 - 100.000   |   0.000 - 99.900    |   0.000 - 100.000   |        |
## |**nh_white**                |                     |                     |                     |                     |                     | < 0.001|
## |&nbsp;&nbsp;&nbsp;Mean (SD) |   54.212 (28.551)   |   42.359 (26.463)   |   34.538 (25.284)   |   38.847 (25.508)   |   42.643 (27.490)   |        |
## |&nbsp;&nbsp;&nbsp;Range     |   0.000 - 100.000   |   0.000 - 94.400    |   0.000 - 89.800    |   0.000 - 98.400    |   0.000 - 100.000   |        |
## |**nh_black**                |                     |                     |                     |                     |                     | < 0.001|
## |&nbsp;&nbsp;&nbsp;Mean (SD) |   8.759 (13.499)    |   12.935 (16.512)   |   14.032 (17.724)   |   11.219 (13.401)   |   11.705 (15.497)   |        |
## |&nbsp;&nbsp;&nbsp;Range     |   0.000 - 95.600    |   0.000 - 96.800    |   0.000 - 92.100    |   0.000 - 81.300    |   0.000 - 96.800    |        |
## |**nh_asian**                |                     |                     |                     |                     |                     | < 0.001|
## |&nbsp;&nbsp;&nbsp;Mean (SD) |    1.520 (3.444)    |    3.956 (6.988)    |    4.533 (7.174)    |    6.041 (8.705)    |    3.991 (7.027)    |        |
## |&nbsp;&nbsp;&nbsp;Range     |   0.000 - 42.800    |   0.000 - 61.200    |   0.000 - 54.700    |   0.000 - 75.800    |   0.000 - 75.800    |        |
## |**percentfemale**           |                     |                     |                     |                     |                     | < 0.001|
## |&nbsp;&nbsp;&nbsp;Mean (SD) |   49.449 (6.519)    |   50.423 (4.538)    |   50.816 (3.939)    |   50.389 (4.000)    |   50.260 (4.906)    |        |
## |&nbsp;&nbsp;&nbsp;Range     |   0.000 - 82.975    |   0.000 - 63.081    |   17.187 - 71.375   |   26.809 - 68.496   |   0.000 - 82.975    |        |

final3<-subset(combineB, select=c(BPHIGH_CrudePrev,CHD_CrudePrev,OBESITY_CrudePrev,DIABETES_CrudePrev,HIGHCHOL_CrudePrev,group))
names(final3)

## [1] "BPHIGH_CrudePrev"   "CHD_CrudePrev"      "OBESITY_CrudePrev" 
## [4] "DIABETES_CrudePrev" "HIGHCHOL_CrudePrev" "group"

library(arsenal) 
table_2 <- tableby(group ~ ., data =final1) 
summary(table_2, title = "Descriptive Analysis II")

## 
## 
## Table: Descriptive Analysis II
## 
## |                            |     Q1 (N=1332)     |     Q2 (N=1346)     |     Q3 (N=1246)     |     Q4 (N=1298)     |   Total (N=5222)    | p value|
## |:---------------------------|:-------------------:|:-------------------:|:-------------------:|:-------------------:|:-------------------:|-------:|
## |**WI**                      |                     |                     |                     |                     |                     | < 0.001|
## |&nbsp;&nbsp;&nbsp;Mean (SD) |    4.273 (1.175)    |    7.222 (0.693)    |    9.505 (0.658)    |   12.629 (1.419)    |    8.358 (3.243)    |        |
## |&nbsp;&nbsp;&nbsp;Range     |    1.000 - 6.000    |    6.000 - 8.333    |   8.500 - 10.667    |   10.833 - 19.000   |   1.000 - 19.000    |        |
## |**totpop**                  |                     |                     |                     |                     |                     | < 0.001|
## |&nbsp;&nbsp;&nbsp;Mean (SD) | 5333.612 (3307.716) | 6087.311 (4283.567) | 5358.671 (3096.476) | 4842.710 (2545.362) | 5411.841 (3407.698) |        |
## |&nbsp;&nbsp;&nbsp;Range     | 37.000 - 35550.000  | 16.000 - 72041.000  | 690.000 - 49119.000 | 454.000 - 41538.000 | 16.000 - 72041.000  |        |
## |**fpop**                    |                     |                     |                     |                     |                     | < 0.001|
## |&nbsp;&nbsp;&nbsp;Mean (SD) | 2641.559 (1698.413) | 3077.238 (2209.441) | 2721.681 (1594.162) | 2446.108 (1323.460) | 2724.393 (1757.038) |        |
## |&nbsp;&nbsp;&nbsp;Range     |  0.000 - 18773.000  |  0.000 - 37626.000  | 288.000 - 26376.000 | 220.000 - 21646.000 |  0.000 - 37626.000  |        |
## |**mage**                    |                     |                     |                     |                     |                     | < 0.001|
## |&nbsp;&nbsp;&nbsp;Mean (SD) |   38.267 (7.862)    |   35.750 (6.367)    |   34.822 (5.957)    |   35.503 (6.186)    |   36.109 (6.773)    |        |
## |&nbsp;&nbsp;&nbsp;Range     |   18.800 - 73.700   |   19.200 - 60.300   |   19.300 - 64.700   |   19.800 - 61.700   |   18.800 - 73.700   |        |
## |**hispan**                  |                     |                     |                     |                     |                     | < 0.001|
## |&nbsp;&nbsp;&nbsp;Mean (SD) |   33.441 (28.103)   |   38.595 (26.960)   |   44.914 (28.211)   |   41.608 (27.779)   |   39.537 (28.067)   |        |
## |&nbsp;&nbsp;&nbsp;Range     |   0.000 - 100.000   |   0.600 - 100.000   |   0.000 - 100.000   |   0.000 - 99.900    |   0.000 - 100.000   |        |
## |**nh_white**                |                     |                     |                     |                     |                     | < 0.001|
## |&nbsp;&nbsp;&nbsp;Mean (SD) |   54.212 (28.551)   |   42.359 (26.463)   |   34.538 (25.284)   |   38.847 (25.508)   |   42.643 (27.490)   |        |
## |&nbsp;&nbsp;&nbsp;Range     |   0.000 - 100.000   |   0.000 - 94.400    |   0.000 - 89.800    |   0.000 - 98.400    |   0.000 - 100.000   |        |
## |**nh_black**                |                     |                     |                     |                     |                     | < 0.001|
## |&nbsp;&nbsp;&nbsp;Mean (SD) |   8.759 (13.499)    |   12.935 (16.512)   |   14.032 (17.724)   |   11.219 (13.401)   |   11.705 (15.497)   |        |
## |&nbsp;&nbsp;&nbsp;Range     |   0.000 - 95.600    |   0.000 - 96.800    |   0.000 - 92.100    |   0.000 - 81.300    |   0.000 - 96.800    |        |
## |**nh_asian**                |                     |                     |                     |                     |                     | < 0.001|
## |&nbsp;&nbsp;&nbsp;Mean (SD) |    1.520 (3.444)    |    3.956 (6.988)    |    4.533 (7.174)    |    6.041 (8.705)    |    3.991 (7.027)    |        |
## |&nbsp;&nbsp;&nbsp;Range     |   0.000 - 42.800    |   0.000 - 61.200    |   0.000 - 54.700    |   0.000 - 75.800    |   0.000 - 75.800    |        |
## |**percentfemale**           |                     |                     |                     |                     |                     | < 0.001|
## |&nbsp;&nbsp;&nbsp;Mean (SD) |   49.449 (6.519)    |   50.423 (4.538)    |   50.816 (3.939)    |   50.389 (4.000)    |   50.260 (4.906)    |        |
## |&nbsp;&nbsp;&nbsp;Range     |   0.000 - 82.975    |   0.000 - 63.081    |   17.187 - 71.375   |   26.809 - 68.496   |   0.000 - 82.975    |        |

Mapping

library(tigris)

## To enable caching of data, set `options(tigris_use_cache = TRUE)`
## in your R script or .Rprofile.

library(leaflet)
tx_censustracts <- tracts(state = "TX")

## Retrieving data for the year 2020

tx_censustracts_cb <- tracts("TX", cb = TRUE)

## Retrieving data for the year 2020

plot(tx_censustracts$geometry)

plot(tx_censustracts_cb$geometry)

dmap <- tx_censustracts_cb %>%  
    mutate(GEOID = as.numeric(GEOID))

dmap1 <-dmap %>%
  left_join(combineB, by = c("GEOID"="TractFIPS")) %>%
  select(GEOID, BPHIGH_CrudePrev,CHD_CrudePrev,OBESITY_CrudePrev,DIABETES_CrudePrev,HIGHCHOL_CrudePrev)
names(dmap1)

## [1] "GEOID"              "BPHIGH_CrudePrev"   "CHD_CrudePrev"     
## [4] "OBESITY_CrudePrev"  "DIABETES_CrudePrev" "HIGHCHOL_CrudePrev"
## [7] "geometry"

library(tmap)
tm_shape(dmap1) + 
  tm_fill(col = "BPHIGH_CrudePrev", style = "quantile",n=4,palette = "Blues", 
          title = "% High BP")

tm_shape(dmap1) + 
  tm_fill(col = "CHD_CrudePrev", style = "quantile",n=4,palette = "Reds", 
          title = "% Coronary Artery Disease")

tm_shape(dmap1) + 
  tm_fill(col = "DIABETES_CrudePrev", style = "quantile",n=4,palette = "Greens", 
          title = "% of Diabetes")

tm_shape(dmap1) + 
  tm_fill(col = "OBESITY_CrudePrev", style = "quantile",n=4,palette = "Oranges", 
          title = "% of Obesity")

tm_shape(dmap1) + 
  tm_fill(col = "HIGHCHOL_CrudePrev", style = "quantile",n=4, palette = "Purples", 
          title = "% of High Cholestrol")