A. Data Preparation

A.1. Data Loading and Initial Transformation

Load data:

# Load the raw health-insurance rate table.
# The input path is built explicitly instead of calling setwd(): changing the
# working directory is a global side effect, and knitr / R Notebooks reset it
# when a chunk finishes, so a relative read in a later chunk cannot rely on it.
data_dir <- "~/drexel_masters/data/INFO659"
# stringsAsFactors = TRUE keeps text columns as factors on R >= 4.0 as well;
# the per-level counts printed by summary() for the text columns require it.
HIMP <- read.csv(
  file.path(data_dir, "final", "HI_rate.csv"),
  stringsAsFactors = TRUE
)
head(HIMP, 5)

Creating subset for analysis

# Columns kept for the analysis (positions 3, 9 and 12-24 of HIMP).
# They are selected by name rather than by position so that a change in the
# CSV column layout fails loudly ("undefined columns selected") instead of
# silently picking different fields.
analysis_cols <- c(
  "StateCode",
  "FederalTIN",
  "PlanId",
  "RatingAreaId",
  "Tobacco",
  "Age",
  "IndividualRate",
  "IndividualTobaccoRate",
  "Couple",
  "PrimarySubscriberAndOneDependent",
  "PrimarySubscriberAndTwoDependents",
  "PrimarySubscriberAndThreeOrMoreDependents",
  "CoupleAndOneDependent",
  "CoupleAndTwoDependents",
  "CoupleAndThreeOrMoreDependents"
)
HI <- HIMP[, analysis_cols]
# Quick sanity checks: first rows and total row count of the subset.
head(HI, 5)
nrow(HI)
[1] 549999
# Per-column overview of HI (counts per level for factor columns; quantiles,
# mean and NA counts for numeric columns).
# NOTE(review): the recorded output shows IndividualRate with a maximum of
# 999999 and a mean (6917.9) far above the median (233.0) -- this looks like a
# sentinel/placeholder value; confirm and handle it before analysing rates.
summary(HI)
 StateCode        FederalTIN                PlanId      
 AK:  5657   59-2015694: 77089   48129FL0160004:  6586  
 AL: 40564   86-0307623: 60713   48129FL0160002:  6545  
 AZ: 75624   47-0397286: 52441   48129FL0160001:  6537  
 FL:316390   36-3097810: 45683   16842FL0010001:  6510  
 GA: 59671   59-2403696: 33956   16842FL0010002:  6480  
 IN: 52093   94-2761537: 33864   48129FL0160003:  6329  
             (Other)   :246253   (Other)       :511012  
        RatingAreaId                             Tobacco      
 Rating Area 5: 31743   No Preference                :366451  
 Rating Area 4: 30439   Tobacco User/Non-Tobacco User:183548  
 Rating Area 6: 28668                                         
 Rating Area 2: 28311                                         
 Rating Area 3: 28001                                         
 Rating Area 1: 25233                                         
 (Other)      :377604                                         
      Age         IndividualRate     IndividualTobaccoRate
 26     : 12122   Min.   :     0.0   Min.   :  67.2       
 44     : 12107   1st Qu.:    25.7   1st Qu.: 350.0       
 38     : 12088   Median :   233.0   Median : 472.0       
 47     : 12051   Mean   :  6917.9   Mean   : 553.6       
 0-20   : 12036   3rd Qu.:   413.6   3rd Qu.: 699.4       
 52     : 12028   Max.   :999999.0   Max.   :2132.6       
 (Other):477567                      NA's   :365958       
     Couple       PrimarySubscriberAndOneDependent
 Min.   : 15.6    Min.   : 20.9                   
 1st Qu.: 29.2    1st Qu.: 29.7                   
 Median : 38.9    Median : 41.7                   
 Mean   : 43.2    Mean   : 44.1                   
 3rd Qu.: 56.0    3rd Qu.: 55.6                   
 Max.   :182.4    Max.   :169.7                   
 NA's   :548575   NA's   :548575                  
 PrimarySubscriberAndTwoDependents
 Min.   : 24.0                    
 1st Qu.: 30.9                    
 Median : 62.0                    
 Mean   : 60.3                    
 3rd Qu.: 81.0                    
 Max.   :248.1                    
 NA's   :548575                   
 PrimarySubscriberAndThreeOrMoreDependents
 Min.   : 24.0                            
 1st Qu.: 30.9                            
 Median : 85.2                            
 Mean   : 79.7                            
 3rd Qu.:112.2                            
 Max.   :357.9                            
 NA's   :548575                           
 CoupleAndOneDependent CoupleAndTwoDependents
 Min.   : 24.0         Min.   : 24.0         
 1st Qu.: 31.8         1st Qu.: 33.8         
 Median : 62.1         Median : 78.5         
 Mean   : 62.4         Mean   : 78.5         
 3rd Qu.: 82.7         3rd Qu.:109.4         
 Max.   :260.9         Max.   :339.3         
 NA's   :548575        NA's   :548575        
 CoupleAndThreeOrMoreDependents
 Min.   : 24.0                 
 1st Qu.: 33.8                 
 Median :105.8                 
 Mean   : 98.0                 
 3rd Qu.:139.5                 
 Max.   :449.1                 
 NA's   :548575                
library(ggplot2)

A.2. Variable Exploration

State code

# Box plot of IndividualRate for each StateCode.
# No layer maps a colour aesthetic, so no manual colour scale is attached:
# such a scale would never be applied and does not affect the rendered plot.
ggplot(HI, aes(x = StateCode, y = IndividualRate)) +
  geom_boxplot()

LS0tDQp0aXRsZTogImhlYWx0aF9pbnN1cmFuY2VfZHJhZnQiDQpvdXRwdXQ6IGh0bWxfbm90ZWJvb2sNCi0tLQ0KIyMgQS4gRGF0YSBQcmVwYXJhdGlvbg0KDQojIyMgQS4xLiBEYXRhIExvYWRpbmcgYW5kIEluaXRpYWwgVHJhbnNmb3JtYXRpb24NCg0KIyMjIyBMb2FkIGRhdGE6DQoNCmBgYHtyfQ0Kc2V0d2QoIn4vZHJleGVsX21hc3RlcnMvZGF0YS9JTkZPNjU5IikNCmBgYA0KDQoNCmBgYHtyfQ0KSElNUCA8LSByZWFkLmNzdigiZmluYWwvSElfcmF0ZS5jc3YiKQ0KYGBgDQoNCmBgYHtyfQ0KaGVhZChISU1QLCA1KQ0KYGBgDQoNCiMjIyMgQ3JlYXRpbmcgc3Vic2V0IGZvciBhbmFseXNpcw0KDQpgYGB7cn0NCkhJIDwtIEhJTVBbLGMoMyw5LDEyLDEzLDE0LDE1LDE2LDE3LDE4LDE5LDIwLDIxLDIyLDIzLDI0KV0NCmBgYA0KDQpgYGB7cn0NCmhlYWQoSEksNSkNCmBgYA0KDQpgYGB7cn0NCm5yb3coSEkpDQpgYGANCg0KYGBge3J9DQpzdW1tYXJ5KEhJKQ0KYGBgDQoNCmBgYHtyfQ0KbGlicmFyeShnZ3Bsb3QyKQ0KYGBgDQoNCiMjIyBBLjIuIFZhcmlhYmxlcyBleHBsb3JhdGlvbg0KDQojIyMjIFN0YXRlIGNvZGUNCg0KYGBge3J9DQpnZ3Bsb3QoSEksIGFlcyh4PVN0YXRlQ29kZSwgeT1JbmRpdmlkdWFsUmF0ZSkpICsgDQogIGdlb21fYm94cGxvdCgpICsgc2NhbGVfY29sb3JfbWFudWFsKHZhbHVlcz1jKCJibGFjayIsImJsYWNrIikpIA0KYGBgDQoNCg==