I selected the U.S. Health Data (2016) file from Social Explorer for further analysis. I will examine the number of people who do not have health insurance in order to answer one question: which region of the United States has the highest number of uninsured residents?
# Attach the packages used for reading the data and rendering the report.
library(readr)
library(knitr)

# Read the Social Explorer export into a tibble.
# NOTE(review): this absolute path only exists on the author's machine.
healthdata <- read_csv(
  file = "/Users/rachel_ramphal/Documents/healthdata.csv"
)

# Preview the first six rows to check that the file was parsed as expected.
head(healthdata, n = 6)
## # A tibble: 6 x 51
## Geo_FIPS Geo_NAME Geo_QNAME Geo_STATE Geo_COUNTY SE_T001_001 SE_T001_002
## <chr> <chr> <chr> <chr> <chr> <dbl> <dbl>
## 1 01001 Autauga… Autauga … 01 001 4.4 4.3
## 2 04001 Apache … Apache C… 04 001 5.7 5.3
## 3 05001 Arkansa… Arkansas… 05 001 4.6 4.2
## 4 06001 Alameda… Alameda … 06 001 3.4 3.3
## 5 08001 Adams C… Adams Co… 08 001 3.5 3.3
## 6 09001 Fairfie… Fairfiel… 09 001 2.8 3.2
## # … with 44 more variables: SE_T002_001 <dbl>, SE_T003_001 <dbl>,
## # SE_NV002_001 <dbl>, SE_T004_001 <dbl>, SE_T004_002 <dbl>,
## # SE_T004_003 <dbl>, SE_NV003_001 <dbl>, SE_NV003_002 <dbl>,
## # SE_NV003_003 <dbl>, SE_T005_001 <dbl>, SE_T006_001 <dbl>,
## # SE_T006_002 <dbl>, SE_T006_003 <dbl>, SE_NV005_001 <dbl>,
## # SE_NV005_002 <dbl>, SE_NV005_003 <dbl>, SE_T007_001 <dbl>,
## # SE_T007_002 <dbl>, SE_T008_001 <dbl>, SE_T008_002 <dbl>,
## # SE_T008_003 <dbl>, SE_T008_004 <dbl>, SE_NV006_001 <dbl>,
## # SE_NV006_002 <dbl>, SE_NV006_003 <dbl>, SE_NV006_004 <dbl>,
## # SE_T009_001 <dbl>, SE_T009_002 <dbl>, SE_NV007_001 <lgl>,
## # SE_NV007_002 <dbl>, SE_T010_001 <dbl>, SE_T010_002 <dbl>,
## # SE_T010_003 <dbl>, SE_NV008_001 <dbl>, SE_NV008_002 <dbl>,
## # SE_NV008_003 <dbl>, SE_T011_001 <dbl>, SE_T011_002 <dbl>,
## # SE_T012_001 <dbl>, SE_T012_002 <dbl>, SE_T012_003 <dbl>,
## # SE_T012_004 <dbl>, SE_T012_005 <dbl>, SE_T013_001 <dbl>
I have changed the names of some variables to make the data clearer.
library(dplyr)
# Replace the coded Social Explorer column names with readable ones.
# Columns that are not listed here keep their original codes.
healthdataNew <- healthdata %>%
  rename(
    # Geography identifiers
    CountyName = Geo_NAME,
    State = Geo_STATE,
    County = Geo_COUNTY,
    # Tables T001-T004
    PUnhealthyDays = SE_T001_001,
    MUnhealthyDays = SE_T001_002,
    FairPoorHealth = SE_T002_001,
    LowBirthWTPct = SE_T003_001,
    LowBirthWTAmt = SE_NV002_001,
    PCP = SE_T004_001,
    MHP = SE_T004_002,
    Dentists = SE_T004_003,
    # Uninsured counts (NV005)
    NoInsurChild = SE_NV005_001,
    NoInsurAdult = SE_NV005_002,
    NoInsurSenior = SE_NV005_003,
    # Tables T007-T010
    PreMDeath = SE_T007_001,
    InfantMort = SE_T008_001,
    ChildMort = SE_T008_002,
    DrugPoisMort = SE_T008_004,
    Diabetic = SE_T009_001,
    TeenBirth = SE_T010_001,
    Chlamydia = SE_T010_002,
    HIVPrev = SE_T010_003
  )

# List the column names to confirm that the renaming took effect.
names(healthdataNew)
## [1] "Geo_FIPS" "CountyName" "Geo_QNAME" "State"
## [5] "County" "PUnhealthyDays" "MUnhealthyDays" "FairPoorHealth"
## [9] "LowBirthWTPct" "LowBirthWTAmt" "PCP" "MHP"
## [13] "Dentists" "SE_NV003_001" "SE_NV003_002" "SE_NV003_003"
## [17] "SE_T005_001" "SE_T006_001" "SE_T006_002" "SE_T006_003"
## [21] "NoInsurChild" "NoInsurAdult" "NoInsurSenior" "PreMDeath"
## [25] "SE_T007_002" "InfantMort" "ChildMort" "SE_T008_003"
## [29] "DrugPoisMort" "SE_NV006_001" "SE_NV006_002" "SE_NV006_003"
## [33] "SE_NV006_004" "Diabetic" "SE_T009_002" "SE_NV007_001"
## [37] "SE_NV007_002" "TeenBirth" "Chlamydia" "HIVPrev"
## [41] "SE_NV008_001" "SE_NV008_002" "SE_NV008_003" "SE_T011_001"
## [45] "SE_T011_002" "SE_T012_001" "SE_T012_002" "SE_T012_003"
## [49] "SE_T012_004" "SE_T012_005" "SE_T013_001"
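I have recoded the State variable, which uses two-digit numeric codes to represent the states, into the new variable Region. The regions created are: 1 = New England (CT, ME, MA, NH, RI, VT); 2 = Mid-Atlantic (DE, DC, MD, NJ, NY, PA); 3 = South (AL, AR, FL, GA, KY, LA, MS, NC, SC, TN, VA, WV); 4 = Midwest (IL, IN, IA, KS, MI, MN, MO, NE, ND, OH, SD, WI); 5 = Southwest (AZ, NM, OK, TX); 6 = West (AK, CA, CO, HI, ID, MT, NV, OR, UT, WA, WY).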
# Add a numeric Region column (codes 1-6) derived from the two-digit State
# code. The mappings are grouped by the region code they produce.
healthdataNew1 <- healthdataNew %>%
  mutate(
    Region = recode(
      State,
      # Region 1
      "09" = 1, "23" = 1, "25" = 1, "33" = 1, "44" = 1, "50" = 1,
      # Region 2
      "10" = 2, "11" = 2, "24" = 2, "34" = 2, "36" = 2, "42" = 2,
      # Region 3
      "01" = 3, "05" = 3, "12" = 3, "13" = 3, "21" = 3, "22" = 3,
      "28" = 3, "37" = 3, "45" = 3, "47" = 3, "51" = 3, "54" = 3,
      # Region 4
      "17" = 4, "18" = 4, "19" = 4, "20" = 4, "26" = 4, "27" = 4,
      "29" = 4, "31" = 4, "38" = 4, "39" = 4, "46" = 4, "55" = 4,
      # Region 5
      "04" = 5, "35" = 5, "40" = 5, "48" = 5,
      # Region 6
      "02" = 6, "06" = 6, "08" = 6, "15" = 6, "16" = 6, "30" = 6,
      "32" = 6, "41" = 6, "49" = 6, "53" = 6, "56" = 6
    )
  )
I kept 5 variables to make the data easier to navigate.
# Keep only the state, the region and the three uninsured counts.
healthdataNew2 <- healthdataNew1 %>%
  select(State, Region, NoInsurChild, NoInsurAdult, NoInsurSenior)

# Rows and columns of the full data set, before the selection.
dim(healthdataNew1)
## [1] 3141 52
# Rows and columns of the reduced data set, after the selection.
dim(healthdataNew2)
## [1] 3141 5
I have filtered the data to show only the counties where more than 500,000 seniors are uninsured.
# Show only the rows whose NoInsurSenior count exceeds 500,000.
healthdataNew2 %>%
  filter(NoInsurSenior > 500000)
## # A tibble: 7 x 5
## State Region NoInsurChild NoInsurAdult NoInsurSenior
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 04 5 124336 563580 675795
## 2 17 4 62867 774524 830462
## 3 06 6 204554 1885666 2064660
## 4 06 6 61200 453081 506359
## 5 12 3 73467 661258 725898
## 6 48 5 105463 554447 650012
## 7 48 5 173822 912690 1071270
The variable InsuranceDiff shows the difference between the number of uninsured seniors (age 65+) and the number of uninsured adults (age 18-64).
# Add InsuranceDiff = NoInsurSenior - NoInsurAdult and store the result.
# mutate() returns a new tibble and leaves its input untouched, so the result
# has to be assigned back for the column to persist in healthdataNew2.
healthdataNew2 <- mutate(
  healthdataNew2,
  InsuranceDiff = NoInsurSenior - NoInsurAdult
)
healthdataNew2 # print the updated tibble
## # A tibble: 3,141 x 6
## State Region NoInsurChild NoInsurAdult NoInsurSenior InsuranceDiff
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 01 3 563 5852 6351 499
## 2 04 5 3327 11779 14874 3095
## 3 05 3 279 2519 2769 250
## 4 06 6 21348 178422 197119 18697
## 5 08 6 14374 75008 88148 13140
## 6 09 1 13960 92568 105291 12723
## 7 10 2 1957 12719 14497 1778
## 8 11 2 3519 37844 40951 3107
## 9 12 3 4613 36430 40292 3862
## 10 13 3 588 3134 3682 548
## # … with 3,131 more rows
names(healthdataNew2) #to show the new variable was created
## [1] "State" "Region" "NoInsurChild" "NoInsurAdult"
## [5] "NoInsurSenior" "InsuranceDiff"
This will show which regions have the most uninsured residents.
# Drop every row that has a missing value in any column, so the sums below
# are not turned into NA.
healthdataNew2 <- healthdataNew2 %>%
  na.omit()

# Group the remaining rows by region code.
by_Region <- healthdataNew2 %>%
  group_by(Region)

# Sum the uninsured counts within each region: one combined total plus one
# total per age-group column.
# NOTE(review): in the printed rows NoInsurSenior is roughly NoInsurChild +
# NoInsurAdult, so Region_TotalUninsured may count people twice. Confirm the
# definition of SE_NV005_003 in the Social Explorer data dictionary.
healthdataNew3 <- by_Region %>%
  summarize(
    Region_TotalUninsured = sum(NoInsurChild, NoInsurAdult, NoInsurSenior),
    Region_ChildUninsured = sum(NoInsurChild),
    Region_AdultUninsured = sum(NoInsurAdult),
    Region_SeniorUninsured = sum(NoInsurSenior)
  )

print(healthdataNew3)
## # A tibble: 6 x 5
## Region Region_TotalUni… Region_ChildUni… Region_AdultUni…
## <dbl> <dbl> <dbl> <dbl>
## 1 1 2026097 105557 913114
## 2 2 10204198 535283 4594013
## 3 3 25861644 1519922 11489315
## 4 4 14943863 904248 6611354
## 5 5 15798067 1333518 6630093
## 6 6 20705003 1326149 9098330
## # … with 1 more variable: Region_SeniorUninsured <dbl>
From this data we can see that Region 3, the South, has the highest number of uninsured residents.