Health Data in the United States (2016)

I have selected the U.S. Health Data (2016) file from Social Explorer to examine further. I will be examining the number of people who do not have health insurance, in order to answer the question: which region of the United States has the highest number of uninsured residents?

Importing Data

library(readr)
# Load the health-data CSV export into `healthdata` using readr.
healthdata <- read_csv(
  file = "/Users/rachel_ramphal/Documents/healthdata.csv"
)

Data Preview

library(knitr)
# Preview the first six rows of the raw data.
head(healthdata, n = 6L)
## # A tibble: 6 x 51
##   Geo_FIPS Geo_NAME Geo_QNAME Geo_STATE Geo_COUNTY SE_T001_001 SE_T001_002
##   <chr>    <chr>    <chr>     <chr>     <chr>            <dbl>       <dbl>
## 1 01001    Autauga… Autauga … 01        001                4.4         4.3
## 2 04001    Apache … Apache C… 04        001                5.7         5.3
## 3 05001    Arkansa… Arkansas… 05        001                4.6         4.2
## 4 06001    Alameda… Alameda … 06        001                3.4         3.3
## 5 08001    Adams C… Adams Co… 08        001                3.5         3.3
## 6 09001    Fairfie… Fairfiel… 09        001                2.8         3.2
## # … with 44 more variables: SE_T002_001 <dbl>, SE_T003_001 <dbl>,
## #   SE_NV002_001 <dbl>, SE_T004_001 <dbl>, SE_T004_002 <dbl>,
## #   SE_T004_003 <dbl>, SE_NV003_001 <dbl>, SE_NV003_002 <dbl>,
## #   SE_NV003_003 <dbl>, SE_T005_001 <dbl>, SE_T006_001 <dbl>,
## #   SE_T006_002 <dbl>, SE_T006_003 <dbl>, SE_NV005_001 <dbl>,
## #   SE_NV005_002 <dbl>, SE_NV005_003 <dbl>, SE_T007_001 <dbl>,
## #   SE_T007_002 <dbl>, SE_T008_001 <dbl>, SE_T008_002 <dbl>,
## #   SE_T008_003 <dbl>, SE_T008_004 <dbl>, SE_NV006_001 <dbl>,
## #   SE_NV006_002 <dbl>, SE_NV006_003 <dbl>, SE_NV006_004 <dbl>,
## #   SE_T009_001 <dbl>, SE_T009_002 <dbl>, SE_NV007_001 <lgl>,
## #   SE_NV007_002 <dbl>, SE_T010_001 <dbl>, SE_T010_002 <dbl>,
## #   SE_T010_003 <dbl>, SE_NV008_001 <dbl>, SE_NV008_002 <dbl>,
## #   SE_NV008_003 <dbl>, SE_T011_001 <dbl>, SE_T011_002 <dbl>,
## #   SE_T012_001 <dbl>, SE_T012_002 <dbl>, SE_T012_003 <dbl>,
## #   SE_T012_004 <dbl>, SE_T012_005 <dbl>, SE_T013_001 <dbl>

Renaming Variables

I have changed the names of some variables to make the data clearer.

library(dplyr)
# Give the coded columns readable names (new_name = old_name).
# Columns that are not listed keep their original names.
healthdataNew <- healthdata %>%
  rename(
    CountyName     = Geo_NAME,
    State          = Geo_STATE,
    County         = Geo_COUNTY,
    PUnhealthyDays = SE_T001_001,
    MUnhealthyDays = SE_T001_002,
    FairPoorHealth = SE_T002_001,
    LowBirthWTPct  = SE_T003_001,
    LowBirthWTAmt  = SE_NV002_001,
    PCP            = SE_T004_001,
    MHP            = SE_T004_002,
    Dentists       = SE_T004_003,
    NoInsurChild   = SE_NV005_001,
    NoInsurAdult   = SE_NV005_002,
    NoInsurSenior  = SE_NV005_003,
    PreMDeath      = SE_T007_001,
    InfantMort     = SE_T008_001,
    ChildMort      = SE_T008_002,
    DrugPoisMort   = SE_T008_004,
    Diabetic       = SE_T009_001,
    TeenBirth      = SE_T010_001,
    Chlamydia      = SE_T010_002,
    HIVPrev        = SE_T010_003
  )
# List the column names to confirm that the renames took effect.
names(healthdataNew)
##  [1] "Geo_FIPS"       "CountyName"     "Geo_QNAME"      "State"         
##  [5] "County"         "PUnhealthyDays" "MUnhealthyDays" "FairPoorHealth"
##  [9] "LowBirthWTPct"  "LowBirthWTAmt"  "PCP"            "MHP"           
## [13] "Dentists"       "SE_NV003_001"   "SE_NV003_002"   "SE_NV003_003"  
## [17] "SE_T005_001"    "SE_T006_001"    "SE_T006_002"    "SE_T006_003"   
## [21] "NoInsurChild"   "NoInsurAdult"   "NoInsurSenior"  "PreMDeath"     
## [25] "SE_T007_002"    "InfantMort"     "ChildMort"      "SE_T008_003"   
## [29] "DrugPoisMort"   "SE_NV006_001"   "SE_NV006_002"   "SE_NV006_003"  
## [33] "SE_NV006_004"   "Diabetic"       "SE_T009_002"    "SE_NV007_001"  
## [37] "SE_NV007_002"   "TeenBirth"      "Chlamydia"      "HIVPrev"       
## [41] "SE_NV008_001"   "SE_NV008_002"   "SE_NV008_003"   "SE_T011_001"   
## [45] "SE_T011_002"    "SE_T012_001"    "SE_T012_002"    "SE_T012_003"   
## [49] "SE_T012_004"    "SE_T012_005"    "SE_T013_001"

Recoding Variables

I have recoded the State variable, which uses two-digit numeric codes to represent the states, into the new variable Region. The regions created are:

  1. New England: Connecticut, Rhode Island, Massachusetts, Vermont, New Hampshire, Maine.
  2. Mid-Atlantic: New York, Pennsylvania, New Jersey, Delaware, Maryland, Washington D.C.
  3. South: Virginia, West Virginia, Tennessee, South Carolina, North Carolina, Louisiana, Mississippi, Kentucky, Georgia, Florida, Arkansas, Alabama.
  4. Midwest: Ohio, Michigan, Indiana, Illinois, Wisconsin, Missouri, Iowa, Minnesota, North Dakota, South Dakota, Nebraska, Kansas.
  5. Southwest: Texas, Oklahoma, New Mexico, Arizona.
  6. West: Colorado, Wyoming, Montana, Idaho, Utah, Nevada, Washington, Oregon, California, Alaska, Hawaii.
# Add a numeric Region column (1-6) derived from the two-digit State code.
# State codes are matched as character strings; the mapping is grouped by
# target region so it can be checked against the region list above.
healthdataNew1 <- healthdataNew %>%
  mutate(
    Region = recode(
      State,
      # Region 1: New England
      "09" = 1, "23" = 1, "25" = 1, "33" = 1, "44" = 1, "50" = 1,
      # Region 2: Mid-Atlantic
      "10" = 2, "11" = 2, "24" = 2, "34" = 2, "36" = 2, "42" = 2,
      # Region 3: South
      "01" = 3, "05" = 3, "12" = 3, "13" = 3, "21" = 3, "22" = 3,
      "28" = 3, "37" = 3, "45" = 3, "47" = 3, "51" = 3, "54" = 3,
      # Region 4: Midwest
      "17" = 4, "18" = 4, "19" = 4, "20" = 4, "26" = 4, "27" = 4,
      "29" = 4, "31" = 4, "38" = 4, "39" = 4, "46" = 4, "55" = 4,
      # Region 5: Southwest
      "04" = 5, "35" = 5, "40" = 5, "48" = 5,
      # Region 6: West
      "02" = 6, "06" = 6, "08" = 6, "15" = 6, "16" = 6, "30" = 6,
      "32" = 6, "41" = 6, "49" = 6, "53" = 6, "56" = 6
    )
  )

Keeping Variables

I kept 5 variables to make the data easier to navigate.

# Keep only the state code, the region, and the three uninsured counts.
healthdataNew2 <- healthdataNew1 %>%
  select(State, Region, NoInsurChild, NoInsurAdult, NoInsurSenior)
# Rows and columns in the original table, before columns were dropped
dim(healthdataNew1)
## [1] 3141   52
# Rows and columns in the reduced table that holds only the kept variables
dim(healthdataNew2)
## [1] 3141    5

Filtering Data

I have filtered the data so it will only show the cases where more than 500,000 seniors are uninsured.

# Show only the rows whose NoInsurSenior count exceeds 500,000.
healthdataNew2 %>%
  filter(NoInsurSenior > 500000)
## # A tibble: 7 x 5
##   State Region NoInsurChild NoInsurAdult NoInsurSenior
##   <chr>  <dbl>        <dbl>        <dbl>         <dbl>
## 1 04         5       124336       563580        675795
## 2 17         4        62867       774524        830462
## 3 06         6       204554      1885666       2064660
## 4 06         6        61200       453081        506359
## 5 12         3        73467       661258        725898
## 6 48         5       105463       554447        650012
## 7 48         5       173822       912690       1071270

Generating New Variables

The variable InsuranceDiff will show the difference between the number of uninsured Seniors (age 65+) and the number of uninsured Adults (age 18-64).

# Add InsuranceDiff (NoInsurSenior minus NoInsurAdult) and store the result
# back into healthdataNew2. Without the assignment the new column is only
# printed and then discarded, so the names() check below would not list it.
# NOTE(review): in the rows printed below, NoInsurSenior is close to
# NoInsurChild + NoInsurAdult, so it may be a combined total rather than a
# seniors-only count -- confirm against the source data dictionary.
healthdataNew2 <- mutate(healthdataNew2, InsuranceDiff = NoInsurSenior - NoInsurAdult)
healthdataNew2 # print the table with the new column
## # A tibble: 3,141 x 6
##    State Region NoInsurChild NoInsurAdult NoInsurSenior InsuranceDiff
##    <chr>  <dbl>        <dbl>        <dbl>         <dbl>         <dbl>
##  1 01         3          563         5852          6351           499
##  2 04         5         3327        11779         14874          3095
##  3 05         3          279         2519          2769           250
##  4 06         6        21348       178422        197119         18697
##  5 08         6        14374        75008         88148         13140
##  6 09         1        13960        92568        105291         12723
##  7 10         2         1957        12719         14497          1778
##  8 11         2         3519        37844         40951          3107
##  9 12         3         4613        36430         40292          3862
## 10 13         3          588         3134          3682           548
## # … with 3,131 more rows
names(healthdataNew2) #to show the new variable was created
## [1] "State"         "Region"        "NoInsurChild"  "NoInsurAdult" 
## [5] "NoInsurSenior" "InsuranceDiff"

Summarizing Variables

This will show which regions have the most uninsured residents.

# Drop every row that has a missing value in any column, so the regional
# sums below are computed over complete rows only.
healthdataNew2 <- na.omit(healthdataNew2)

# Group the rows by Region, then total each uninsured count per region.
# NOTE(review): Region_TotalUninsured adds all three NoInsur* columns. If
# NoInsurSenior is itself a combined total (it is close to
# NoInsurChild + NoInsurAdult in the previewed rows), this figure
# double-counts -- confirm against the source data dictionary.
by_Region <- healthdataNew2 %>%
  group_by(Region)
healthdataNew3 <- by_Region %>%
  summarize(
    Region_TotalUninsured = sum(NoInsurChild, NoInsurAdult, NoInsurSenior),
    Region_ChildUninsured = sum(NoInsurChild),
    Region_AdultUninsured = sum(NoInsurAdult),
    Region_SeniorUninsured = sum(NoInsurSenior)
  )

print(healthdataNew3)
## # A tibble: 6 x 5
##   Region Region_TotalUni… Region_ChildUni… Region_AdultUni…
##    <dbl>            <dbl>            <dbl>            <dbl>
## 1      1          2026097           105557           913114
## 2      2         10204198           535283          4594013
## 3      3         25861644          1519922         11489315
## 4      4         14943863           904248          6611354
## 5      5         15798067          1333518          6630093
## 6      6         20705003          1326149          9098330
## # … with 1 more variable: Region_SeniorUninsured <dbl>

From this data we can see that Region 3, the South, has the highest number of uninsured residents.