A. Data Preparation

Data preparation and loading:

# Load the raw rate table and preview its first rows.
# NOTE(review): setwd() with a hard-coded, home-relative path only works on
# the author's machine; it is kept here so that the session's working
# directory stays exactly as before.
setwd("~/drexel_masters/data/INFO659")
# stringsAsFactors = TRUE is spelled out because the read.csv() default
# changed to FALSE in R 4.0.0. The per-level counts shown by summary() and the
# factor-vs-factor plots later in this analysis need StateCode, FederalTIN,
# PlanId, Tobacco and Age to be factors.
HIMP <- read.csv("final/HI_rate.csv", stringsAsFactors = TRUE)
head(HIMP, 5)

Creating subset for analysis

# Keep the 14 analysis columns of the raw table (positions 3, 9, 12 and
# 14 through 24), then preview the subset and count its rows.
HI <- HIMP[, c(3, 9, 12, 14:24)]
head(HI, n = 5)
nrow(x = HI)
[1] 549999
# Per-column overview of the analysis subset (level counts and quantiles).
summary(object = HI)
 StateCode        FederalTIN                PlanId      
 AK:  5657   59-2015694: 77089   48129FL0160004:  6586  
 AL: 40564   86-0307623: 60713   48129FL0160002:  6545  
 AZ: 75624   47-0397286: 52441   48129FL0160001:  6537  
 FL:316390   36-3097810: 45683   16842FL0010001:  6510  
 GA: 59671   59-2403696: 33956   16842FL0010002:  6480  
 IN: 52093   94-2761537: 33864   48129FL0160003:  6329  
             (Other)   :246253   (Other)       :511012  
                          Tobacco            Age        
 No Preference                :366451   26     : 12122  
 Tobacco User/Non-Tobacco User:183548   44     : 12107  
                                        38     : 12088  
                                        47     : 12051  
                                        0-20   : 12036  
                                        52     : 12028  
                                        (Other):477567  
 IndividualRate     IndividualTobaccoRate     Couple      
 Min.   :     0.0   Min.   :  67.2        Min.   : 15.6   
 1st Qu.:    25.7   1st Qu.: 350.0        1st Qu.: 29.2   
 Median :   233.0   Median : 472.0        Median : 38.9   
 Mean   :  6917.9   Mean   : 553.6        Mean   : 43.2   
 3rd Qu.:   413.6   3rd Qu.: 699.4        3rd Qu.: 56.0   
 Max.   :999999.0   Max.   :2132.6        Max.   :182.4   
                    NA's   :365958        NA's   :548575  
 PrimarySubscriberAndOneDependent
 Min.   : 20.9                   
 1st Qu.: 29.7                   
 Median : 41.7                   
 Mean   : 44.1                   
 3rd Qu.: 55.6                   
 Max.   :169.7                   
 NA's   :548575                  
 PrimarySubscriberAndTwoDependents
 Min.   : 24.0                    
 1st Qu.: 30.9                    
 Median : 62.0                    
 Mean   : 60.3                    
 3rd Qu.: 81.0                    
 Max.   :248.1                    
 NA's   :548575                   
 PrimarySubscriberAndThreeOrMoreDependents
 Min.   : 24.0                            
 1st Qu.: 30.9                            
 Median : 85.2                            
 Mean   : 79.7                            
 3rd Qu.:112.2                            
 Max.   :357.9                            
 NA's   :548575                           
 CoupleAndOneDependent CoupleAndTwoDependents
 Min.   : 24.0         Min.   : 24.0         
 1st Qu.: 31.8         1st Qu.: 33.8         
 Median : 62.1         Median : 78.5         
 Mean   : 62.4         Mean   : 78.5         
 3rd Qu.: 82.7         3rd Qu.:109.4         
 Max.   :260.9         Max.   :339.3         
 NA's   :548575        NA's   :548575        
 CoupleAndThreeOrMoreDependents
 Min.   : 24.0                 
 1st Qu.: 33.8                 
 Median :105.8                 
 Mean   : 98.0                 
 3rd Qu.:139.5                 
 Max.   :449.1                 
 NA's   :548575                

The column “Tobacco” contains two values: “No Preference” (366,451 accounts) and “Tobacco User/Non-Tobacco User” (183,548 accounts). Therefore, almost two-thirds of consumers were enrolled in plans that did not determine the insurance premium based on smoking status.

The values of the variable “IndividualRate” ranged from $0 to $999,999.0. Rates of “0” and “999,999.0” are unexpected: they do not make sense as insurance premiums and should be treated as outliers. Therefore, these accounts should be excluded from further analyses.

# Pull out the records whose individual rate is exactly zero, show them,
# and summarise which states / issuers / plans they come from.
outlier1 <- subset(HI, subset = IndividualRate == 0)
outlier1
summary(object = outlier1)
 StateCode      FederalTIN              PlanId    
 AK: 579   59-2876465:6467   30115FL0010001:3252  
 AL:   0   34-0648820:1215   30115FL0020001:3215  
 AZ:1680   47-0098400: 970   43499IN0070001: 419  
 FL:6467   93-0242990: 953   43499IN0070002: 406  
 GA:   0   91-1857813: 336   43499IN0060001: 390  
 IN:1215   06-6033492:   0   52147AZ0040002: 179  
           (Other)   :   0   (Other)       :2080  
                          Tobacco          Age      
 No Preference                :9941   33     : 243  
 Tobacco User/Non-Tobacco User:   0   32     : 240  
                                      57     : 238  
                                      35     : 236  
                                      62     : 235  
                                      23     : 233  
                                      (Other):8516  
 IndividualRate IndividualTobaccoRate     Couple    
 Min.   :0      Min.   : NA           Min.   : NA   
 1st Qu.:0      1st Qu.: NA           1st Qu.: NA   
 Median :0      Median : NA           Median : NA   
 Mean   :0      Mean   :NaN           Mean   :NaN   
 3rd Qu.:0      3rd Qu.: NA           3rd Qu.: NA   
 Max.   :0      Max.   : NA           Max.   : NA   
                NA's   :9941          NA's   :9941  
 PrimarySubscriberAndOneDependent
 Min.   : NA                     
 1st Qu.: NA                     
 Median : NA                     
 Mean   :NaN                     
 3rd Qu.: NA                     
 Max.   : NA                     
 NA's   :9941                    
 PrimarySubscriberAndTwoDependents
 Min.   : NA                      
 1st Qu.: NA                      
 Median : NA                      
 Mean   :NaN                      
 3rd Qu.: NA                      
 Max.   : NA                      
 NA's   :9941                     
 PrimarySubscriberAndThreeOrMoreDependents
 Min.   : NA                              
 1st Qu.: NA                              
 Median : NA                              
 Mean   :NaN                              
 3rd Qu.: NA                              
 Max.   : NA                              
 NA's   :9941                             
 CoupleAndOneDependent CoupleAndTwoDependents
 Min.   : NA           Min.   : NA           
 1st Qu.: NA           1st Qu.: NA           
 Median : NA           Median : NA           
 Mean   :NaN           Mean   :NaN           
 3rd Qu.: NA           3rd Qu.: NA           
 Max.   : NA           Max.   : NA           
 NA's   :9941          NA's   :9941          
 CoupleAndThreeOrMoreDependents
 Min.   : NA                   
 1st Qu.: NA                   
 Median : NA                   
 Mean   :NaN                   
 3rd Qu.: NA                   
 Max.   : NA                   
 NA's   :9941                  

Only five FederalTINs were associated with the “0” outliers (59-2876465, 34-0648820, 47-0098400, 93-0242990 and 91-1857813).

# Pull out the records whose individual rate equals 999999, show them,
# and summarise which states / issuers / plans they come from.
outlier2 <- subset(HI, subset = IndividualRate == 999999)
outlier2
summary(object = outlier2)
 StateCode      FederalTIN              PlanId    
 AK: 289   95-6042390:3643   83502GA0010002: 389  
 AL:1216   06-6033492:   0   83502GA0020001: 362  
 AZ: 666   13-5123390:   0   83502GA0020002: 361  
 FL:   0   13-5581829:   0   83502GA0010001: 360  
 GA:1472   14-1917982:   0   12538AL0010001: 312  
 IN:   0   20-3174593:   0   12538AL0020001: 304  
           (Other)   :   0   (Other)       :1555  
                          Tobacco          Age      
 No Preference                :3643   47     :  94  
 Tobacco User/Non-Tobacco User:   0   63     :  94  
                                      25     :  93  
                                      31     :  93  
                                      21     :  89  
                                      43     :  87  
                                      (Other):3093  
 IndividualRate  IndividualTobaccoRate     Couple    
 Min.   :1e+06   Min.   : NA           Min.   : NA   
 1st Qu.:1e+06   1st Qu.: NA           1st Qu.: NA   
 Median :1e+06   Median : NA           Median : NA   
 Mean   :1e+06   Mean   :NaN           Mean   :NaN   
 3rd Qu.:1e+06   3rd Qu.: NA           3rd Qu.: NA   
 Max.   :1e+06   Max.   : NA           Max.   : NA   
                 NA's   :3643          NA's   :3643  
 PrimarySubscriberAndOneDependent
 Min.   : NA                     
 1st Qu.: NA                     
 Median : NA                     
 Mean   :NaN                     
 3rd Qu.: NA                     
 Max.   : NA                     
 NA's   :3643                    
 PrimarySubscriberAndTwoDependents
 Min.   : NA                      
 1st Qu.: NA                      
 Median : NA                      
 Mean   :NaN                      
 3rd Qu.: NA                      
 Max.   : NA                      
 NA's   :3643                     
 PrimarySubscriberAndThreeOrMoreDependents
 Min.   : NA                              
 1st Qu.: NA                              
 Median : NA                              
 Mean   :NaN                              
 3rd Qu.: NA                              
 Max.   : NA                              
 NA's   :3643                             
 CoupleAndOneDependent CoupleAndTwoDependents
 Min.   : NA           Min.   : NA           
 1st Qu.: NA           1st Qu.: NA           
 Median : NA           Median : NA           
 Mean   :NaN           Mean   :NaN           
 3rd Qu.: NA           3rd Qu.: NA           
 Max.   : NA           Max.   : NA           
 NA's   :3643          NA's   :3643          
 CoupleAndThreeOrMoreDependents
 Min.   : NA                   
 1st Qu.: NA                   
 Median : NA                   
 Mean   :NaN                   
 3rd Qu.: NA                   
 Max.   : NA                   
 NA's   :3643                  

Only one FederalTin was associated with the $999,999 outliers (the Best Life And Health Insurance Company).

Creating the new filtered subset without outliers:

# Remove both groups of outliers in two passes: first every rate at or above
# 999999, then every rate that is not strictly positive.
filteredHI1 <- subset(HI, subset = IndividualRate < 999999)
newHI <- subset(filteredHI1, subset = IndividualRate > 0)
summary(object = newHI)
 StateCode        FederalTIN                PlanId      
 AK:  4789   59-2015694: 77089   48129FL0160004:  6586  
 AL: 39348   86-0307623: 60713   48129FL0160002:  6545  
 AZ: 73278   47-0397286: 52441   48129FL0160001:  6537  
 FL:309923   36-3097810: 45683   16842FL0010001:  6510  
 GA: 58199   59-2403696: 33956   16842FL0010002:  6480  
 IN: 50878   94-2761537: 33864   48129FL0160003:  6329  
             (Other)   :232669   (Other)       :497428  
                          Tobacco            Age        
 No Preference                :352867   0-20   : 12036  
 Tobacco User/Non-Tobacco User:183548   44     : 11830  
                                        26     : 11818  
                                        38     : 11790  
                                        52     : 11738  
                                        47     : 11727  
                                        (Other):465476  
 IndividualRate    IndividualTobaccoRate     Couple      
 Min.   :   7.82   Min.   :  67.2        Min.   : 15.6   
 1st Qu.:  26.74   1st Qu.: 350.0        1st Qu.: 29.2   
 Median : 237.94   Median : 472.0        Median : 38.9   
 Mean   : 301.67   Mean   : 553.6        Mean   : 43.2   
 3rd Qu.: 413.06   3rd Qu.: 699.4        3rd Qu.: 56.0   
 Max.   :9999.00   Max.   :2132.6        Max.   :182.4   
                   NA's   :352374        NA's   :534991  
 PrimarySubscriberAndOneDependent
 Min.   : 20.9                   
 1st Qu.: 29.7                   
 Median : 41.7                   
 Mean   : 44.1                   
 3rd Qu.: 55.6                   
 Max.   :169.7                   
 NA's   :534991                  
 PrimarySubscriberAndTwoDependents
 Min.   : 24.0                    
 1st Qu.: 30.9                    
 Median : 62.0                    
 Mean   : 60.3                    
 3rd Qu.: 81.0                    
 Max.   :248.1                    
 NA's   :534991                   
 PrimarySubscriberAndThreeOrMoreDependents
 Min.   : 24.0                            
 1st Qu.: 30.9                            
 Median : 85.2                            
 Mean   : 79.7                            
 3rd Qu.:112.2                            
 Max.   :357.9                            
 NA's   :534991                           
 CoupleAndOneDependent CoupleAndTwoDependents
 Min.   : 24.0         Min.   : 24.0         
 1st Qu.: 31.8         1st Qu.: 33.8         
 Median : 62.1         Median : 78.5         
 Mean   : 62.4         Mean   : 78.5         
 3rd Qu.: 82.7         3rd Qu.:109.4         
 Max.   :260.9         Max.   :339.3         
 NA's   :534991        NA's   :534991        
 CoupleAndThreeOrMoreDependents
 Min.   : 24.0                 
 1st Qu.: 33.8                 
 Median :105.8                 
 Mean   : 98.0                 
 3rd Qu.:139.5                 
 Max.   :449.1                 
 NA's   :534991                
# Number of records left after removing the 0 and 999999 rates.
nrow(x = newHI)
[1] 536415

A.2. Data distribution and anomalies

library(ggplot2)

State code vs IndividualRate

# Box plot of the individual rate per state on the partially cleaned data.
# No manual colour scale is attached: nothing is mapped to the colour
# aesthetic, so such a scale would never be applied to any layer.
ggplot(newHI, aes(x = StateCode, y = IndividualRate)) +
  geom_boxplot()

The visualization of the distribution of the insurance rate across the individual states revealed additional outliers with the value “9,999.0”. Only one FederalTIN (belonging to the Cigna Health and Life Insurance Company) was associated with the $9,999 outliers. These outliers will be excluded from the data.

# Collect the records whose individual rate equals 9999 and summarise them.
outlier3 <- subset(newHI, subset = IndividualRate == 9999)
summary(object = outlier3)
 StateCode      FederalTIN              PlanId    
 AK:   0   59-1031071:1992   48121FL0030001:1580  
 AL:   0   06-6033492:   0   50491GA0030001: 253  
 AZ: 159   13-5123390:   0   86830AZ0050001: 159  
 FL:1580   13-5581829:   0   12538AL0010001:   0  
 GA: 253   14-1917982:   0   12538AL0010002:   0  
 IN:   0   20-3174593:   0   12538AL0010003:   0  
           (Other)   :   0   (Other)       :   0  
                          Tobacco              Age      
 No Preference                :1992   62         :  58  
 Tobacco User/Non-Tobacco User:   0   45         :  55  
                                      65 and over:  55  
                                      51         :  53  
                                      25         :  50  
                                      52         :  50  
                                      (Other)    :1671  
 IndividualRate IndividualTobaccoRate     Couple    
 Min.   :9999   Min.   : NA           Min.   : NA   
 1st Qu.:9999   1st Qu.: NA           1st Qu.: NA   
 Median :9999   Median : NA           Median : NA   
 Mean   :9999   Mean   :NaN           Mean   :NaN   
 3rd Qu.:9999   3rd Qu.: NA           3rd Qu.: NA   
 Max.   :9999   Max.   : NA           Max.   : NA   
                NA's   :1992          NA's   :1992  
 PrimarySubscriberAndOneDependent
 Min.   : NA                     
 1st Qu.: NA                     
 Median : NA                     
 Mean   :NaN                     
 3rd Qu.: NA                     
 Max.   : NA                     
 NA's   :1992                    
 PrimarySubscriberAndTwoDependents
 Min.   : NA                      
 1st Qu.: NA                      
 Median : NA                      
 Mean   :NaN                      
 3rd Qu.: NA                      
 Max.   : NA                      
 NA's   :1992                     
 PrimarySubscriberAndThreeOrMoreDependents
 Min.   : NA                              
 1st Qu.: NA                              
 Median : NA                              
 Mean   :NaN                              
 3rd Qu.: NA                              
 Max.   : NA                              
 NA's   :1992                             
 CoupleAndOneDependent CoupleAndTwoDependents
 Min.   : NA           Min.   : NA           
 1st Qu.: NA           1st Qu.: NA           
 Median : NA           Median : NA           
 Mean   :NaN           Mean   :NaN           
 3rd Qu.: NA           3rd Qu.: NA           
 Max.   : NA           Max.   : NA           
 NA's   :1992          NA's   :1992          
 CoupleAndThreeOrMoreDependents
 Min.   : NA                   
 1st Qu.: NA                   
 Median : NA                   
 Mean   :NaN                   
 3rd Qu.: NA                   
 Max.   : NA                   
 NA's   :1992                  

Creating the final subset for our analysis

# Final analysis data set: keep only rates strictly below 9999.
finalHI <- subset(newHI, subset = IndividualRate < 9999)
summary(object = finalHI)
 StateCode        FederalTIN                PlanId      
 AK:  4789   59-2015694: 77089   48129FL0160004:  6586  
 AL: 39348   86-0307623: 60713   48129FL0160002:  6545  
 AZ: 73119   47-0397286: 52441   48129FL0160001:  6537  
 FL:308343   36-3097810: 45683   16842FL0010001:  6510  
 GA: 57946   59-2403696: 33956   16842FL0010002:  6480  
 IN: 50878   94-2761537: 33864   48129FL0160003:  6329  
             (Other)   :230677   (Other)       :495436  
                          Tobacco            Age        
 No Preference                :350875   0-20   : 12036  
 Tobacco User/Non-Tobacco User:183548   44     : 11784  
                                        26     : 11776  
                                        38     : 11757  
                                        52     : 11688  
                                        47     : 11686  
                                        (Other):463696  
 IndividualRate    IndividualTobaccoRate     Couple      
 Min.   :   7.82   Min.   :  67.2        Min.   : 15.6   
 1st Qu.:  26.73   1st Qu.: 350.0        1st Qu.: 29.2   
 Median : 236.36   Median : 472.0        Median : 38.9   
 Mean   : 265.53   Mean   : 553.6        Mean   : 43.2   
 3rd Qu.: 410.22   3rd Qu.: 699.4        3rd Qu.: 56.0   
 Max.   :1898.28   Max.   :2132.6        Max.   :182.4   
                   NA's   :350382        NA's   :532999  
 PrimarySubscriberAndOneDependent
 Min.   : 20.9                   
 1st Qu.: 29.7                   
 Median : 41.7                   
 Mean   : 44.1                   
 3rd Qu.: 55.6                   
 Max.   :169.7                   
 NA's   :532999                  
 PrimarySubscriberAndTwoDependents
 Min.   : 24.0                    
 1st Qu.: 30.9                    
 Median : 62.0                    
 Mean   : 60.3                    
 3rd Qu.: 81.0                    
 Max.   :248.1                    
 NA's   :532999                   
 PrimarySubscriberAndThreeOrMoreDependents
 Min.   : 24.0                            
 1st Qu.: 30.9                            
 Median : 85.2                            
 Mean   : 79.7                            
 3rd Qu.:112.2                            
 Max.   :357.9                            
 NA's   :532999                           
 CoupleAndOneDependent CoupleAndTwoDependents
 Min.   : 24.0         Min.   : 24.0         
 1st Qu.: 31.8         1st Qu.: 33.8         
 Median : 62.1         Median : 78.5         
 Mean   : 62.4         Mean   : 78.5         
 3rd Qu.: 82.7         3rd Qu.:109.4         
 Max.   :260.9         Max.   :339.3         
 NA's   :532999        NA's   :532999        
 CoupleAndThreeOrMoreDependents
 Min.   : 24.0                 
 1st Qu.: 33.8                 
 Median :105.8                 
 Mean   : 98.0                 
 3rd Qu.:139.5                 
 Max.   :449.1                 
 NA's   :532999                
# Open the spreadsheet-style data viewer only in an interactive session;
# View() needs a data viewer and can fail when the file is run
# non-interactively (e.g. via Rscript or when rendering).
if (interactive()) {
  View(finalHI)
}

Plotting State code vs IndividualRate

# Box plot of the individual rate per state on the fully cleaned data.
# No manual colour scale is attached: nothing is mapped to the colour
# aesthetic, so such a scale would never be applied to any layer.
ggplot(finalHI, aes(x = StateCode, y = IndividualRate)) +
  geom_boxplot()

FederalTIN

# Box plot of the individual rate per issuer (FederalTIN).
# No manual colour scale is attached because nothing is mapped to colour.
# FederalTIN has dozens of ten-character levels, so the x tick labels are
# rotated to keep them from overprinting each other.
ggplot(finalHI, aes(x = FederalTIN, y = IndividualRate)) +
  geom_boxplot() +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))

# Number of records per FederalTIN level.
summary(finalHI$FederalTIN)
06-6033492 13-5123390 13-5581829 14-1917982 20-3174593 
      4361       8649       4780       6467       1976 
20-8937577 26-0155137 26-3238817 27-3835905 31-1069321 
      1975        220      16222       2902      25135 
34-0648820 35-0472300 35-0781558 36-3097810 36-3691770 
      5775       7698      17140      45683        838 
39-1263473 39-1821211 45-3131932 45-4370907 46-1509576 
      2694       1990       1741       1200        542 
46-2210067 47-0098400 47-0397286 58-0469845 58-1592076 
       771         17      52441       3100       7156 
58-1638390 58-2209549 58-2335921 59-1031071 59-1419293 
      9641       2029       1488       9435        113 
59-2015694 59-2403696 59-2876465 59-3315064 61-1013183 
     77089      33956       6605       3150        229 
61-1103898 63-0103830 63-1028262 65-0073323 65-0743731 
      1751       8964       5567        133       6636 
65-0986441 75-1233841 86-0004538 86-0207231 86-0274899 
      1591      19869       7712      15039       2700 
86-0307623 91-1857813 93-0242990 93-0438772 93-0989307 
     60713         21         17         69       4317 
94-2761537 95-6042390 
     33864        252 
# Base-graphics plot of two factors: state against issuer.
plot(finalHI$StateCode, finalHI$FederalTIN)

# Box plot of the individual rate by tobacco-rating category.
# No manual colour scale is attached: nothing is mapped to the colour
# aesthetic, so such a scale would never be applied to any layer.
ggplot(finalHI, aes(x = Tobacco, y = IndividualRate)) +
  geom_boxplot()

Age vs. Individual Rate

# One box per Age level showing the spread of individual rates.
ggplot(data = finalHI, mapping = aes(x = Age, y = IndividualRate)) +
  geom_boxplot()

Producing a histogram for the individual insurance rates

# Histogram of the cleaned individual rates; breaks = 50 asks for about
# 50 bins.
hist(
  finalHI$IndividualRate,
  breaks = 50,
  xlab = "IndividualRate",
  main = "Individual Rate Values distribution"
)

Without the outliers, the individual health insurance rates ranged from $7.82 to $1,898.28, with a median of $236.36 and a mean of $265.53. The histogram is skewed to the right (Fig.2). Almost half of the individual rates fall below $100.

Producing a histogram for individual rates with log transformation

# Same histogram on a base-10 log scale, to expose structure hidden by the
# long tail of the raw rates.
hist(
  log10(finalHI$IndividualRate),
  breaks = 50,
  xlab = "Individual Rate",
  main = "Individual Rates: with log transformation"
)

With log transformation, the distribution of individual rates became bimodal, with two peaks, which usually indicates the presence of two different groups (Fig.3). Therefore, we need to find out how different these groups are. Based on the histogram (Fig. 2), we assumed that the first group included individual rates of $100 or less and the second group included accounts with rates above $100. For further analysis, we created two different subsets: “Cheap Health Insurance” and “Expensive Health Insurance.”

# "Cheap" group: records with an individual rate of 100 or less.
cheapHI <- subset(finalHI, subset = IndividualRate <= 100)
summary(object = cheapHI)
 StateCode        FederalTIN               PlanId      
 AK:   472   86-0307623:60713   48129FL0160004:  6586  
 AL: 15739   47-0397286:52441   48129FL0160002:  6545  
 AZ: 10163   94-2761537:33864   48129FL0160001:  6537  
 FL:161346   75-1233841:19869   48129FL0160003:  6329  
 GA: 32668   13-5123390: 8649   30115FL0040001:  3241  
 IN: 14734   35-0472300: 7698   30115FL0050001:  3234  
             (Other)   :51888   (Other)       :202650  
                          Tobacco            Age        
 No Preference                :235020   0-20   :  5777  
 Tobacco User/Non-Tobacco User:   102   38     :  5184  
                                        44     :  5175  
                                        47     :  5159  
                                        64     :  5155  
                                        26     :  5151  
                                        (Other):203521  
 IndividualRate   IndividualTobaccoRate     Couple      
 Min.   :  7.82   Min.   : 67.16        Min.   : 15.61  
 1st Qu.: 18.79   1st Qu.: 87.00        1st Qu.: 29.24  
 Median : 24.61   Median : 98.25        Median : 38.94  
 Mean   : 28.12   Mean   : 98.93        Mean   : 43.18  
 3rd Qu.: 31.52   3rd Qu.:109.00        3rd Qu.: 56.03  
 Max.   :100.00   Max.   :137.53        Max.   :182.40  
                  NA's   :235020        NA's   :233698  
 PrimarySubscriberAndOneDependent
 Min.   : 20.92                  
 1st Qu.: 29.74                  
 Median : 41.73                  
 Mean   : 44.13                  
 3rd Qu.: 55.63                  
 Max.   :169.65                  
 NA's   :233698                  
 PrimarySubscriberAndTwoDependents
 Min.   : 24.01                   
 1st Qu.: 30.90                   
 Median : 61.97                   
 Mean   : 60.27                   
 3rd Qu.: 81.04                   
 Max.   :248.11                   
 NA's   :233698                   
 PrimarySubscriberAndThreeOrMoreDependents
 Min.   : 24.01                           
 1st Qu.: 30.90                           
 Median : 85.21                           
 Mean   : 79.73                           
 3rd Qu.:112.23                           
 Max.   :357.94                           
 NA's   :233698                           
 CoupleAndOneDependent CoupleAndTwoDependents
 Min.   : 24.01        Min.   : 24.01        
 1st Qu.: 31.76        1st Qu.: 33.82        
 Median : 62.07        Median : 78.50        
 Mean   : 62.36        Mean   : 78.50        
 3rd Qu.: 82.70        3rd Qu.:109.37        
 Max.   :260.85        Max.   :339.31        
 NA's   :233698        NA's   :233698        
 CoupleAndThreeOrMoreDependents
 Min.   : 24.01                
 1st Qu.: 33.82                
 Median :105.85                
 Mean   : 97.96                
 3rd Qu.:139.53                
 Max.   :449.14                
 NA's   :233698                
# Box plot of the individual rate per Age level within the cheap group.
# No manual colour scale is attached: nothing is mapped to the colour
# aesthetic, so such a scale would never be applied to any layer.
ggplot(cheapHI, aes(x = Age, y = IndividualRate)) +
  geom_boxplot()

# Number of cheap-group records per FederalTIN level.
summary(cheapHI$FederalTIN)
06-6033492 13-5123390 13-5581829 14-1917982 20-3174593 
        14       8649        498       6467          1 
20-8937577 26-0155137 26-3238817 27-3835905 31-1069321 
         0          0          0          0       4819 
34-0648820 35-0472300 35-0781558 36-3097810 36-3691770 
      5775       7698       3302         65        838 
39-1263473 39-1821211 45-3131932 45-4370907 46-1509576 
      2466          0          0          1          0 
46-2210067 47-0098400 47-0397286 58-0469845 58-1592076 
         1         17      52441       3100          3 
58-1638390 58-2209549 58-2335921 59-1031071 59-1419293 
         0         11          0       1835          0 
59-2015694 59-2403696 59-2876465 59-3315064 61-1013183 
         2          0       6605          0          2 
61-1103898 63-0103830 63-1028262 65-0073323 65-0743731 
         7        622       5567        133       6636 
65-0986441 75-1233841 86-0004538 86-0207231 86-0274899 
        32      19869         10          0       2700 
86-0307623 91-1857813 93-0242990 93-0438772 93-0989307 
     60713         21         17         69          0 
94-2761537 95-6042390 
     33864        252 

# Cheap group: state against issuer (two factors).
plot(x = cheapHI$StateCode, y = cheapHI$FederalTIN)

# Cheap group: individual rate against the couple rate.
plot(x = cheapHI$IndividualRate, y = cheapHI$Couple)

# "Expensive" group: records with an individual rate above 100.
expensiveHI <- subset(finalHI, subset = IndividualRate > 100)
summary(object = expensiveHI)
 StateCode        FederalTIN               PlanId      
 AK:  4317   59-2015694:77087   16842FL0010001:  6510  
 AL: 23609   36-3097810:45618   16842FL0010002:  6480  
 AZ: 62956   59-2403696:33956   30252FL0010001:  4195  
 FL:146997   31-1069321:20316   30252FL0010002:  4195  
 GA: 25278   26-3238817:16222   16842FL0070088:  1671  
 IN: 36144   86-0207231:15039   17575IN0720001:  1670  
             (Other)   :91063   (Other)       :274580  
                          Tobacco                Age        
 No Preference                :115855   26         :  6625  
 Tobacco User/Non-Tobacco User:183446   44         :  6609  
                                        52         :  6591  
                                        57         :  6582  
                                        33         :  6580  
                                        65 and over:  6574  
                                        (Other)    :259740  
 IndividualRate   IndividualTobaccoRate     Couple      
 Min.   : 100.0   Min.   : 100.3        Min.   : NA     
 1st Qu.: 289.5   1st Qu.: 350.2        1st Qu.: NA     
 Median : 383.4   Median : 472.2        Median : NA     
 Mean   : 452.0   Mean   : 553.8        Mean   :NaN     
 3rd Qu.: 568.2   3rd Qu.: 699.6        3rd Qu.: NA     
 Max.   :1898.3   Max.   :2132.6        Max.   : NA     
                  NA's   :115362        NA's   :299301  
 PrimarySubscriberAndOneDependent
 Min.   : NA                     
 1st Qu.: NA                     
 Median : NA                     
 Mean   :NaN                     
 3rd Qu.: NA                     
 Max.   : NA                     
 NA's   :299301                  
 PrimarySubscriberAndTwoDependents
 Min.   : NA                      
 1st Qu.: NA                      
 Median : NA                      
 Mean   :NaN                      
 3rd Qu.: NA                      
 Max.   : NA                      
 NA's   :299301                   
 PrimarySubscriberAndThreeOrMoreDependents
 Min.   : NA                              
 1st Qu.: NA                              
 Median : NA                              
 Mean   :NaN                              
 3rd Qu.: NA                              
 Max.   : NA                              
 NA's   :299301                           
 CoupleAndOneDependent CoupleAndTwoDependents
 Min.   : NA           Min.   : NA           
 1st Qu.: NA           1st Qu.: NA           
 Median : NA           Median : NA           
 Mean   :NaN           Mean   :NaN           
 3rd Qu.: NA           3rd Qu.: NA           
 Max.   : NA           Max.   : NA           
 NA's   :299301        NA's   :299301        
 CoupleAndThreeOrMoreDependents
 Min.   : NA                   
 1st Qu.: NA                   
 Median : NA                   
 Mean   :NaN                   
 3rd Qu.: NA                   
 Max.   : NA                   
 NA's   :299301                
# Expensive group: individual rate against the tobacco-user rate.
plot(x = expensiveHI$IndividualRate, y = expensiveHI$IndividualTobaccoRate)

# Distribution of the tobacco-user rate on the raw scale ...
hist(
  expensiveHI$IndividualTobaccoRate,
  breaks = 50,
  xlab = "IndividualTobaccoRate",
  main = "Individual Tobacco Rate distribution"
)

# ... and on a base-10 log scale.
hist(
  log10(expensiveHI$IndividualTobaccoRate),
  breaks = 50,
  xlab = "Individual Tobacco Rate",
  main = "Individual Tobacco Rates: with log transformation"
)

# Box plot of the individual rate per state within the expensive group.
# No manual colour scale is attached: nothing is mapped to the colour
# aesthetic, so such a scale would never be applied to any layer.
ggplot(expensiveHI, aes(x = StateCode, y = IndividualRate)) +
  geom_boxplot()

# Number of expensive-group records per FederalTIN level.
summary(expensiveHI$FederalTIN)
06-6033492 13-5123390 13-5581829 14-1917982 20-3174593 
      4347          0       4282          0       1975 
20-8937577 26-0155137 26-3238817 27-3835905 31-1069321 
      1975        220      16222       2902      20316 
34-0648820 35-0472300 35-0781558 36-3097810 36-3691770 
         0          0      13838      45618          0 
39-1263473 39-1821211 45-3131932 45-4370907 46-1509576 
       228       1990       1741       1199        542 
46-2210067 47-0098400 47-0397286 58-0469845 58-1592076 
       770          0          0          0       7153 
58-1638390 58-2209549 58-2335921 59-1031071 59-1419293 
      9641       2018       1488       7600        113 
59-2015694 59-2403696 59-2876465 59-3315064 61-1013183 
     77087      33956          0       3150        227 
61-1103898 63-0103830 63-1028262 65-0073323 65-0743731 
      1744       8342          0          0          0 
65-0986441 75-1233841 86-0004538 86-0207231 86-0274899 
      1559          0       7702      15039          0 
86-0307623 91-1857813 93-0242990 93-0438772 93-0989307 
         0          0          0          0       4317 
94-2761537 95-6042390 
         0          0 
# Expensive group: state against issuer (two factors).
plot(expensiveHI$StateCode, expensiveHI$FederalTIN)

# Box plots of the individual rate by tobacco-rating category and by Age
# level. No manual colour scale is attached to either plot: nothing is mapped
# to the colour aesthetic, so such a scale would never be applied.
ggplot(expensiveHI, aes(x = Tobacco, y = IndividualRate)) +
  geom_boxplot()

ggplot(expensiveHI, aes(x = Age, y = IndividualRate)) +
  geom_boxplot()

LS0tDQp0aXRsZTogIkhlYWx0aCBDYXJlIEluc3VyYW5jZSBNYXJrZXRwbGFjZSINCm91dHB1dDogaHRtbF9ub3RlYm9vaw0KLS0tDQojIyBBLiBEYXRhIFByZXBhcmF0aW9uDQoNCkRhdGEgcHJlcGFyYXRpb24gYW5kIGxvYWRpbmc6DQpgYGB7cn0NCnNldHdkKCJ+L2RyZXhlbF9tYXN0ZXJzL2RhdGEvSU5GTzY1OSIpDQpgYGANCg0KYGBge3J9DQpISU1QIDwtIHJlYWQuY3N2KCJmaW5hbC9ISV9yYXRlLmNzdiIpDQpgYGANCg0KYGBge3J9DQpoZWFkKEhJTVAsIDUpDQpgYGANCg0KQ3JlYXRpbmcgc3Vic2V0IGZvciBhbmFseXNpcw0KDQpgYGB7cn0NCkhJIDwtIEhJTVBbLGMoMyw5LDEyLDE0LDE1LDE2LDE3LDE4LDE5LDIwLDIxLDIyLDIzLDI0KV0NCmBgYA0KDQpgYGB7cn0NCmhlYWQoSEksNSkNCmBgYA0KDQpgYGB7cn0NCm5yb3coSEkpDQpgYGANCg0KYGBge3J9DQpzdW1tYXJ5KEhJKQ0KYGBgDQoNClRoZSBjb2x1bW4gIlRvYmFjY28iIGNvbnRhaW5zIHR3byB2YWx1ZXM6ICJObyBwcmVmZXJlbmNlIiAoMzY2LDQ1MSBhY2NvdW50cykgYW5kICJUb2JhY2NvIFVzZXIvTm9uLVRvYmFjY28gVXNlciIgKDE4Myw1NDggYWNjb3VudHMpLiBUaGVyZWZvcmUsIGFsbW9zdCAyLzMgY29uc3VtZXJzIHdlcmUgZW5yb2xsZWQgaW4gcGxhbnMgdGhhdCBkaWQgbm90IGRldGVybWluZSBhbiBpbnN1cmFuY2UgcHJlbWl1bSBiYXNlZCBvbiBzbW9raW5nIHN0YXR1cy4gIA0KDQpUaGUgdmFsdWVzIG9mIHRoZSB2YXJpYWJsZSAiSW5kaXZpZHVhbFJhdGUicmFuZ2VkIGZyb20gMCB0byAkOTk5LDk5OS4wLiBUaGUgYWNjb3VudHMgd2l0aCB2YWx1ZXMgb2YgIjAiIGFuZCAiOTk5LDk5OS4wIiBhcmUgdW5leHBlY3RlZCB2YWx1ZXMuIFRoZXkgZG8gbm90IG1ha2Ugc2Vuc2UgYW5kIHNob3VsZCBiZSBjb25zaWRlcmVkIGFzIG91dGxpZXJzLiBUaGVyZWZvcmUsIHRoZXNlIGFjY291bnRzIHNob3VsZCBiZSBleGNsdWRlZCBmcm9tIGZ1cnRoZXIgYW5hbHlzZXMuIA0KDQpgYGB7cn0NCm91dGxpZXIxIDwtIHN1YnNldChISSwgSW5kaXZpZHVhbFJhdGUgPT0wKQ0Kb3V0bGllcjENCmBgYA0KDQpgYGB7cn0NCnN1bW1hcnkob3V0bGllcjEpDQpgYGANCg0KT25seSBmb3VyIEZlZGVyYWxUaW5zIHdlcmUgYXNzb2NpYXRlZCB3aXRoIHRoZSAiMCIgb3V0bGllcnMNCg0KDQpgYGB7cn0NCm91dGxpZXIyIDwtIHN1YnNldChISSwgSW5kaXZpZHVhbFJhdGUgPT05OTk5OTkuMCkNCm91dGxpZXIyDQpgYGANCg0KYGBge3J9DQpzdW1tYXJ5KG91dGxpZXIyKQ0KYGBgDQoNCk9ubHkgb25lIEZlZGVyYWxUaW4gd2FzIGFzc29jaWF0ZWQgd2l0aCB0aGUgJDk5OSw5OTkgb3V0bGllcnMgKHRoZSBCZXN0IExpZmUgQW5kIEhlYWx0aCBJbnN1cmFuY2UgQ29tcGFueSkuICANCg0KQ3JlYXRpbmcgdGhlIG5ldyBmaWx0ZXJlZCBzdWJzZXQgd2l0aG91dCBvdXRsaWVyczoNCg0KYGBge3J9DQpmaWx0ZXJlZEhJMSA8LSBzdWJzZXQoSEksIEluZGl2aWR1YWxSYXRlIDw5OTk5OTkuMCkN
Cm5ld0hJIDwtIHN1YnNldChmaWx0ZXJlZEhJMSwgSW5kaXZpZHVhbFJhdGUgPjApDQpzdW1tYXJ5KG5ld0hJKQ0KYGBgDQpgYGB7cn0NCm5yb3cobmV3SEkpDQpgYGANCg0KIyMgQS4yLiBEYXRhIGRpc3RyaWJ1dGlvbiBhbmQgYW5vbWFsaWVzDQoNCmBgYHtyfQ0KbGlicmFyeShnZ3Bsb3QyKQ0KYGBgDQoNCiMjIyMgU3RhdGUgY29kZSB2cyBJbmRpdmlkdWFsUmF0ZQ0KDQpgYGB7cn0NCmdncGxvdChuZXdISSwgYWVzKHg9U3RhdGVDb2RlLCB5PUluZGl2aWR1YWxSYXRlKSkgKyANCiAgZ2VvbV9ib3hwbG90KCkgKyBzY2FsZV9jb2xvcl9tYW51YWwodmFsdWVzPWMoImJsYWNrIiwiYmxhY2siKSkgDQpgYGANCg0KVGhlIHZpc3VhbGl6YXRpb24gb2YgdGhlIGRpc3RyaWJ1dGlvbiBvZiB0aGUgaW5zdXJhbmNlIHJhdGUgYWNyb3NzIHRoZSBpbmRpdmlkdWFsIHN0YXRlcyBzaG93ZWQgbW9yZSBvdXRsaWVycyB3aXRoIHZhbHVlICI5LDk5OS4wIi4gT25seSBvbmUgRmVkZXJhbFRJTiAoYmVsb25ncyB0byB0aGUgQ2lnbmEgSGVhbHRoIGFuZCBMaWZlIEluc3VyYW5jZSBDb21wYW55KSB3YXMgYXNzb2NpYXRlZCB3aXRoIHRoZSAkOTk5OSBvdXRsaWVycy4gVGhlc2Ugb3V0bGllcnMgd2lsbCBiZSBleGNsdWRlZCBmcm9tIHRoZSBkYXRhLiAgDQoNCmBgYHtyfQ0Kb3V0bGllcjMgPC0gc3Vic2V0KG5ld0hJLCBJbmRpdmlkdWFsUmF0ZSA9PTk5OTkuMCkNCnN1bW1hcnkob3V0bGllcjMpDQpgYGANCg0KQ3JlYXRpbmcgdGhlIGZpbmFsIHN1YnNldCBmb3Igb3VyIGFuYWx5c2lzDQoNCmBgYHtyfQ0KZmluYWxISSA8LSBzdWJzZXQobmV3SEksIEluZGl2aWR1YWxSYXRlIDw5OTk5LjApDQpzdW1tYXJ5KGZpbmFsSEkpDQpgYGANCg0KYGBge3J9DQpWaWV3KGZpbmFsSEkpDQpgYGANCg0KUGxvdHRpbmcgU3RhdGUgY29kZSB2cyBJbmRpdmlkdWFsUmF0ZQ0KDQpgYGB7cn0NCmdncGxvdChmaW5hbEhJLCBhZXMoeD1TdGF0ZUNvZGUsIHk9SW5kaXZpZHVhbFJhdGUpKSArIA0KICBnZW9tX2JveHBsb3QoKSArIHNjYWxlX2NvbG9yX21hbnVhbCh2YWx1ZXM9YygiYmxhY2siLCJibGFjayIpKSANCmBgYA0KDQoNCg0KIyMgRmVkZXJhbFRJTg0KDQpgYGB7cn0NCmdncGxvdChmaW5hbEhJLCBhZXMoeD1GZWRlcmFsVElOLCB5PUluZGl2aWR1YWxSYXRlKSkgKyANCiAgZ2VvbV9ib3hwbG90KCkgKyBzY2FsZV9jb2xvcl9tYW51YWwodmFsdWVzPWMoImJsYWNrIiwiYmxhY2siKSkNCmBgYA0KDQpgYGB7cn0NCnN1bW1hcnkoZmluYWxISSRGZWRlcmFsVElOKQ0KYGBgDQoNCmBgYHtyfQ0KcGxvdChmaW5hbEhJJFN0YXRlQ29kZSwgZmluYWxISSRGZWRlcmFsVElOKQ0KYGBgDQoNCg0KYGBge3J9DQpnZ3Bsb3QoZmluYWxISSwgYWVzKHg9VG9iYWNjbywgeT1JbmRpdmlkdWFsUmF0ZSkpICsgDQogIGdlb21fYm94cGxvdCgpICsgc2NhbGVfY29sb3JfbWFudWFsKHZhbHVlcz1jKCJibGFjayIsImJsYWNrIikpDQpgYGANCg0KIyMjIEFnZSB2cy4gSW5kaXZpZHVhbCBSYXRl
DQoNCmBgYHtyfQ0KZ2dwbG90KGZpbmFsSEksIGFlcyh4PUFnZSwgeT1JbmRpdmlkdWFsUmF0ZSkpICsgDQogIGdlb21fYm94cGxvdCgpIA0KYGBgDQpQcm9kdWNpbmcgYSBoaXN0b2dyYW0gZm9yIHRoZSBpbmRpdmlkdWFsIGluc3VyYW5jZSByYXRlcw0KDQpgYGB7cn0NCmhpc3QoZmluYWxISSRJbmRpdmlkdWFsUmF0ZSwgYnJlYWtzPTUwLCB4bGFiPSJJbmRpdmlkdWFsUmF0ZSIsIG1haW49IkluZGl2aWR1YWwgUmF0ZSBWYWx1ZXMgZGlzdHJpYnV0aW9uIikNCmBgYA0KDQpXaXRob3V0IHRoZSBvdXRsaWVycywgdGhlIHZhbHVlcyBvZiB0aGUgaW5kaXZpZHVhbCBoZWFsdGggaW5zdXJhbmNlIHJhdGVzIHJhbmdlZCBmcm9tICQ3LjgyIHRvICQxLjg5OC4yOCB3aXRoIG1lZGlhbiAkMjM2LjM2IGFuZCAkMjY1LjUzLiBUaGUgaGlzdG9ncmFtIHNoYXBlIGlzIHNrZXdlZCB0byB0aGUgbGVmdCAoRmlnLjIpLiBBbG1vc3QgaGFsZiBpbmRpdmlkdWFsIHJhdGVzIGFyZSBpbiB0aGUgZ3JvdXBzIGJlbG93ICQxMDAuIA0KDQpQcm9kdWNpbmcgYSBoaXN0b2dyYW0gZm9yIGluZGl2aWR1YWwgcmF0ZXMgd2l0aCBsb2cgdHJhbnNmb3JtYXRpb24NCg0KYGBge3J9DQpoaXN0KGxvZzEwKGZpbmFsSEkkSW5kaXZpZHVhbFJhdGUpLCBicmVha3M9NTAsIHhsYWI9IkluZGl2aWR1YWwgUmF0ZSIsIG1haW49IkluZGl2aWR1YWwgUmF0ZXM6IHdpdGggbG9nIHRyYW5zZm9ybWF0aW9uIikNCmBgYA0KDQpXaXRoIGxvZyB0cmFuc2Zvcm1hdGlvbiwgdGhlIGRpc3RyaWJ1dGlvbiBvZiBpbmRpdmlkdWFsIHJhdGVzIGJlY2FtZSBiaW1vZGFsIHdpdGggdHdvIHBlYWtzIG9mIGRhdGEsIHdoaWNoIHVzdWFsbHkgaW5kaWNhdGVzIHRoZSBwcmVzZW5jZSBvZiB0d28gZGlmZmVyZW50IGdyb3VwcyAoRmlnLjMpLiBUaGVyZWZvcmUsIHdlIG5lZWQgdG8gZmluZCBvdXQgaG93IGRpZmZlcmVudCB0aGVzZSBncm91cHMgYXJlLiBCYXNlZCBvbiB0aGUgaGlzdG9ncmFtIChGaWcuIDIpLCB3ZSBhc3N1bWVkIHRoYXQgdGhlIGZpcnN0IGdyb3VwIGluY2x1ZGVkIGluZGl2aWR1YWwgcmF0ZXMgd2l0aCB2YWx1ZXMgYmVsb3cgJDEwMCBhbmQgdGhlIHNlY29uZCBncm91cCBpbmNsdWRlZCBhY2NvdW50cyB3aXRoIHJhdGVzIGFib3ZlICQxMDAuIEZvciBmdXJ0aGVyIGFuYWx5c2lzLCB3ZSBjcmVhdGVkIHR3byBkaWZmZXJlbnQgc3Vic2V0czogIkNoZWFwIEhlYWx0aCBJbnN1cmFuY2UiIGFuZCAiRXhwZW5zaXZlIEhlYWx0aCBJbnN1cmFuY2UuIiAgIA0KDQoNCmBgYHtyfQ0KY2hlYXBISSA8LSBzdWJzZXQoZmluYWxISSwgSW5kaXZpZHVhbFJhdGU8PTEwMCkNCnN1bW1hcnkoY2hlYXBISSkNCmBgYA0KDQpgYGB7cn0NCmdncGxvdChjaGVhcEhJLCBhZXMoeD1BZ2UsIHk9SW5kaXZpZHVhbFJhdGUpKSArIA0KICBnZW9tX2JveHBsb3QoKSArIHNjYWxlX2NvbG9yX21hbnVhbCh2YWx1ZXM9YygiYmxhY2siLCJibGFjayIpKSANCmBgYA0KDQpgYGB7cn0NCnN1bW1hcnkoY2hlYXBISSRG
ZWRlcmFsVElOKQ0KYGBgDQoNCmBgYHtyfQ0KDQpwbG90KGNoZWFwSEkkU3RhdGVDb2RlLCBjaGVhcEhJJEZlZGVyYWxUSU4pDQpgYGANCg0KDQoNCmBgYHtyfQ0KcGxvdChjaGVhcEhJJEluZGl2aWR1YWxSYXRlLCBjaGVhcEhJJENvdXBsZSkNCmBgYA0KDQpgYGB7cn0NCmV4cGVuc2l2ZUhJIDwtIHN1YnNldChmaW5hbEhJLCBJbmRpdmlkdWFsUmF0ZT4xMDApDQpzdW1tYXJ5KGV4cGVuc2l2ZUhJKQ0KYGBgDQoNCmBgYHtyfQ0KcGxvdChleHBlbnNpdmVISSRJbmRpdmlkdWFsUmF0ZSwgZXhwZW5zaXZlSEkkSW5kaXZpZHVhbFRvYmFjY29SYXRlKQ0KYGBgDQoNCmBgYHtyfQ0KaGlzdChleHBlbnNpdmVISSRJbmRpdmlkdWFsVG9iYWNjb1JhdGUsIGJyZWFrcz01MCwgeGxhYj0iSW5kaXZpZHVhbFRvYmFjY29SYXRlIiwgbWFpbj0iSW5kaXZpZHVhbCBUb2JhY2NvIFJhdGUgZGlzdHJpYnV0aW9uIikNCmBgYA0KDQpgYGB7cn0NCmhpc3QobG9nMTAoZXhwZW5zaXZlSEkkSW5kaXZpZHVhbFRvYmFjY29SYXRlKSwgYnJlYWtzPTUwLCB4bGFiPSJJbmRpdmlkdWFsIFRvYmFjY28gUmF0ZSIsIG1haW49IkluZGl2aWR1YWwgVG9iYWNjbyBSYXRlczogd2l0aCBsb2cgdHJhbnNmb3JtYXRpb24iKQ0KYGBgDQoNCmBgYHtyfQ0KZ2dwbG90KGV4cGVuc2l2ZUhJLCBhZXMoeD1TdGF0ZUNvZGUsIHk9SW5kaXZpZHVhbFJhdGUpKSArIA0KICBnZW9tX2JveHBsb3QoKSArIHNjYWxlX2NvbG9yX21hbnVhbCh2YWx1ZXM9YygiYmxhY2siLCJibGFjayIpKQ0KYGBgDQoNCmBgYHtyfQ0Kc3VtbWFyeShleHBlbnNpdmVISSRGZWRlcmFsVElOKQ0KYGBgDQoNCmBgYHtyfQ0KcGxvdChleHBlbnNpdmVISSRTdGF0ZUNvZGUsIGV4cGVuc2l2ZUhJJEZlZGVyYWxUSU4pDQpgYGANCg0KYGBge3J9DQpnZ3Bsb3QoZXhwZW5zaXZlSEksIGFlcyh4PVRvYmFjY28sIHk9SW5kaXZpZHVhbFJhdGUpKSArIA0KICBnZW9tX2JveHBsb3QoKSArIHNjYWxlX2NvbG9yX21hbnVhbCh2YWx1ZXM9YygiYmxhY2siLCJibGFjayIpKQ0KYGBgDQoNCmBgYHtyfQ0KZ2dwbG90KGV4cGVuc2l2ZUhJLCBhZXMoeD1BZ2UsIHk9SW5kaXZpZHVhbFJhdGUpKSArIA0KICBnZW9tX2JveHBsb3QoKSArIHNjYWxlX2NvbG9yX21hbnVhbCh2YWx1ZXM9YygiYmxhY2siLCJibGFjayIpKSANCmBgYA0KDQo=