# Work from the course data directory so the relative CSV path resolves.
setwd("~/drexel_masters/data/INFO659")

# Read the raw rate table and show its first five rows.
HIMP <- read.csv(file = "final/HI_rate.csv")
head(HIMP, n = 5)

# Keep columns 3, 9 and 12 through 24 of the raw table, then show the first
# five rows and the row count of the reduced data frame.
HI <- HIMP[c(3, 9, 12:24)]
head(HI, n = 5)
nrow(HI)
#> [1] 549999
# Column-by-column summary of the reduced data frame.
summary(object = HI)
#> StateCode FederalTIN PlanId
#> AK: 5657 59-2015694: 77089 48129FL0160004: 6586
#> AL: 40564 86-0307623: 60713 48129FL0160002: 6545
#> AZ: 75624 47-0397286: 52441 48129FL0160001: 6537
#> FL:316390 36-3097810: 45683 16842FL0010001: 6510
#> GA: 59671 59-2403696: 33956 16842FL0010002: 6480
#> IN: 52093 94-2761537: 33864 48129FL0160003: 6329
#> (Other) :246253 (Other) :511012
#> RatingAreaId Tobacco
#> Rating Area 5: 31743 No Preference :366451
#> Rating Area 4: 30439 Tobacco User/Non-Tobacco User:183548
#> Rating Area 6: 28668
#> Rating Area 2: 28311
#> Rating Area 3: 28001
#> Rating Area 1: 25233
#> (Other) :377604
#> Age IndividualRate IndividualTobaccoRate
#> 26 : 12122 Min. : 0.0 Min. : 67.2
#> 44 : 12107 1st Qu.: 25.7 1st Qu.: 350.0
#> 38 : 12088 Median : 233.0 Median : 472.0
#> 47 : 12051 Mean : 6917.9 Mean : 553.6
#> 0-20 : 12036 3rd Qu.: 413.6 3rd Qu.: 699.4
#> 52 : 12028 Max. :999999.0 Max. :2132.6
#> (Other):477567 NA's :365958
#> Couple PrimarySubscriberAndOneDependent
#> Min. : 15.6 Min. : 20.9
#> 1st Qu.: 29.2 1st Qu.: 29.7
#> Median : 38.9 Median : 41.7
#> Mean : 43.2 Mean : 44.1
#> 3rd Qu.: 56.0 3rd Qu.: 55.6
#> Max. :182.4 Max. :169.7
#> NA's :548575 NA's :548575
#> PrimarySubscriberAndTwoDependents
#> Min. : 24.0
#> 1st Qu.: 30.9
#> Median : 62.0
#> Mean : 60.3
#> 3rd Qu.: 81.0
#> Max. :248.1
#> NA's :548575
#> PrimarySubscriberAndThreeOrMoreDependents
#> Min. : 24.0
#> 1st Qu.: 30.9
#> Median : 85.2
#> Mean : 79.7
#> 3rd Qu.:112.2
#> Max. :357.9
#> NA's :548575
#> CoupleAndOneDependent CoupleAndTwoDependents
#> Min. : 24.0 Min. : 24.0
#> 1st Qu.: 31.8 1st Qu.: 33.8
#> Median : 62.1 Median : 78.5
#> Mean : 62.4 Mean : 78.5
#> 3rd Qu.: 82.7 3rd Qu.:109.4
#> Max. :260.9 Max. :339.3
#> NA's :548575 NA's :548575
#> CoupleAndThreeOrMoreDependents
#> Min. : 24.0
#> 1st Qu.: 33.8
#> Median :105.8
#> Mean : 98.0
#> 3rd Qu.:139.5
#> Max. :449.1
#> NA's :548575
library(ggplot2)
# Box plot of IndividualRate for each StateCode.
# The former scale_color_manual() layer was removed: aes() maps only x and y,
# so there was no colour aesthetic for a manual colour scale to act on.
# NOTE(review): the summary of HI in this file reports IndividualRate values
# up to 999999 against a median of 233, which will compress the boxes on this
# axis. Confirm whether those rows should be excluded or the y-axis limited.
ggplot(HI, aes(x = StateCode, y = IndividualRate)) +
  geom_boxplot()