Problem Description & Current State of Affairs

Medicare.gov released recent two years’s data about “Drug and Health Plan” coverage around the country. The data inclues who, when, where, and how the plans were used. There are many plans with low ratings. And we want to find out how predictors such as political affiliation, income level, insurance rates, unemployment rates, hospital type, or happiness index contribute to the final rating of one specific type of plan. This is where we start to build the predicition modeling. It could be taken for reference for future plan upgrading or new plan release.

Algorithms Used and Rationale

Algorithms Rationale
Logistic Regession, Naive Bayes, Decision Tree, ANN Since the project is one supervised problem, we are going to apply multiple prediction models to the datasets. And we will choose the final one with the highest accuracy.

Drug and Health Plan with copay, deductible, coinsurance

drug_cost <- read.csv("./data/2015Med2000_flatfiles/vwPlanDrugsCostSharing.csv", stringsAsFactors = FALSE, na.strings = c("Information Not Available", ""))
# data points before preprocessing
str(drug_cost)
## 'data.frame':    5227 obs. of  28 variables:
##  $ contract_id                    : chr  "90091" "E0654" "E0654" "E0654" ...
##  $ segment_id                     : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ plan_id                        : int  999 801 802 803 801 801 801 801 801 1 ...
##  $ contract_year                  : int  2015 2015 2015 2015 2015 2015 2015 2015 2015 2015 ...
##  $ language_id                    : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ plan_cvrg_typ_id               : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ mnthly_prm                     : num  0 33 33 33 33 33 33 33 33 0 ...
##  $ ann_ddctbl                     : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ icl_lmt                        : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ thrshld                        : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ copay_amt_inn_prefrd           : chr  NA NA NA NA ...
##  $ copay_amt_inn_prefrd_lvl       : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ coins_amt_inn_prefrd           : chr  NA NA NA NA ...
##  $ coins_amt_inn_prefrd_lvl       : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ copay_amt_inn_general          : chr  NA NA NA NA ...
##  $ copay_amt_inn_general_lvl      : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ coins_amt_inn_general          : chr  NA NA NA NA ...
##  $ coins_amt_inn_general_lvl      : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ copay_amt_mail_ordr_prefrd     : chr  NA NA NA NA ...
##  $ copay_amt_mail_ordr_prefrd_lvl : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ coins_amt_mail_ordr_prefrd     : chr  NA NA NA NA ...
##  $ coins_amt_mail_ordr_prefrd_lvl : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ copay_amt_mail_ordr_general    : chr  NA NA NA NA ...
##  $ copay_amt_mail_ordr_general_lvl: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ coins_amt_mail_ordr_general    : chr  NA NA NA NA ...
##  $ coins_amt_mail_ordr_general_lvl: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ is_mdcr_dfnd_std_yn            : chr  " " "N" "N" "N" ...
##  $ post_thrshld_mdcr_std_cost     : int  0 0 0 0 0 0 0 0 0 0 ...
# keep the copay, deductible, and coinsurance
colnames <- c("contract_id", "plan_id", "mnthly_prm", "ann_ddctbl", "copay_amt_inn_prefrd", "coins_amt_inn_prefrd")
drug_cost2 <- drug_cost[,colnames]
drug_cost3 <- drug_cost2
# split the range into standalone characters
drug_cost3$copay_low <- sapply(strsplit(drug_cost2$copay_amt_inn_prefrd, split="-"), '[', 1)
drug_cost3$copay_high <- sapply(strsplit(drug_cost2$copay_amt_inn_prefrd, split="-"), '[', 2)

drug_cost3$coin_low <- sapply(strsplit(drug_cost2$coins_amt_inn_prefrd, split="-"), '[', 1)
drug_cost3$coin_high <- sapply(strsplit(drug_cost2$coins_amt_inn_prefrd, split="-"), '[', 2)
# remove $ and %
library(gdata)
drug_cost3$copay_low <- gsub("^\\$", "", trim(drug_cost3$copay_low))
drug_cost3$copay_high <- gsub("^\\$", "", trim(drug_cost3$copay_high))
drug_cost3$coin_low <- gsub("%$", "", trim(drug_cost3$coin_low))
drug_cost3$coin_high <- gsub("%$", "", trim(drug_cost3$coin_high))
# drug_cost3 <- na.omit(drug_cost3)
#change NA to 0
drug_cost3$copay_low[is.na(drug_cost3$copay_low)] <- 0
drug_cost3$copay_high[is.na(drug_cost3$copay_high)] <- 0
drug_cost3$coin_low[is.na(drug_cost3$coin_low)] <- 0
drug_cost3$coin_high[is.na(drug_cost3$coin_high)] <- 0
# choose the maxium value
drug_cost3$copay_max <- apply(drug_cost3[, c("copay_low", "copay_high")], 1, max)
drug_cost3$coin_max <- apply(drug_cost3[, c("coin_low", "coin_low")], 1, max)
# subset the needed matrix
drug_cost4 <- drug_cost3[, c("contract_id", "mnthly_prm", "ann_ddctbl", "copay_max", "coin_max")]
# change the column type to numeric
drug_cost4$ann_ddctbl <- as.numeric(drug_cost4$ann_ddctbl)
drug_cost4$copay_max <- as.numeric(drug_cost4$copay_max)
drug_cost4$coin_max <- as.numeric(drug_cost4$coin_max)
drug_cost4$mnthly_prm <- as.numeric(drug_cost4$mnthly_prm)
# aggreate the drug_cost by contract id
drug_cost4_aggr1 <- aggregate(drug_cost4[,c("mnthly_prm", "ann_ddctbl")], by = list(drug_cost4$contract_id), FUN = mean)
drug_cost4_aggr2 <- aggregate(drug_cost4[,c("copay_max", "coin_max")], by = list(drug_cost4$contract_id), FUN = max)
drug_cost4_aggr <- merge(drug_cost4_aggr1, drug_cost4_aggr2, all = TRUE, by.x = "Group.1", by.y = "Group.1")
colnames(drug_cost4_aggr) <- c("contract_id", "mnthly_prm", "ann_ddctbl", "copay_max", "coin_max")
#write.csv(drug_cost4_aggr, file = "./data/internal_drugcost.csv")
str(drug_cost4_aggr)
## 'data.frame':    816 obs. of  5 variables:
##  $ contract_id: chr  "90091" "E0654" "E2630" "E3014" ...
##  $ mnthly_prm : num  0 33 33 33 33 33 33 0 0 5.5 ...
##  $ ann_ddctbl : num  0 0 0 0 0 0 0 0 0 160 ...
##  $ copay_max  : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ coin_max   : num  0 0 0 0 0 0 0 0 0 0 ...
tail(drug_cost4_aggr)
##     contract_id mnthly_prm ann_ddctbl copay_max coin_max
## 811       S7694   34.86364   261.8182        85       33
## 812       S7950   33.00000     0.0000         0        0
## 813       S8067   67.00000     0.0000        89       33
## 814       S8841   33.00000     0.0000         0        0
## 815       S9579   39.60870   153.0435         0        0
## 816       S9701   33.00000     0.0000         0        0

Drug and Health Cost Tier Level

# load vwPlanDrugTierCost.csv 
cost_tier <- read.csv("./data/2015Med2000_flatfiles/vwPlanDrugTierCost.csv", stringsAsFactors = FALSE)
# data points before preprocessing
str(cost_tier)
## 'data.frame':    374040 obs. of  12 variables:
##  $ Language          : chr  "English" "English" "English" "English" ...
##  $ segment_id        : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ contract_id       : chr  "H0022" "H0022" "H0022" "H0022" ...
##  $ plan_id           : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ contract_year     : int  2015 2015 2015 2015 2015 2015 2015 2015 2015 2015 ...
##  $ language_id       : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ tier_level        : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ tier_label        : chr  "Generic Drugs" "Generic Drugs" "Generic Drugs" "Generic Drugs" ...
##  $ tier_type_desc    : chr  "Standard Retail Cost-Sharing 30 days" "Standard Mail Order Cost-Sharing gap 60 days" "Standard Mail Order Cost-Sharing 60 days" "Standard Mail Order Cost-Sharing gap 30 days" ...
##  $ cost_share_pref   : chr  "" "" "" "" ...
##  $ cost_share_gen    : chr  "" "" "" "" ...
##  $ exception_tier_ind: int  0 0 0 0 0 0 0 0 0 0 ...
# keep contract_id, plan_id, languange_id, tier_lable
colnames <- c("contract_id", "language_id", "tier_label")
cost_tier2 <- cost_tier[, colnames]
cost_tier2 <- cost_tier2[cost_tier2$language_id == "1", ]
cost_tier3 <- cost_tier2[, c("contract_id", "tier_label")]
# count unique tier numbers for each contract id
cost_tier4 <- aggregate(data = cost_tier3,  tier_label ~ contract_id, function(x) length(unique(x)))
#write.csv(drug_cost4_aggr, file = "./data/internal_costtier.csv")
str(cost_tier4)
## 'data.frame':    585 obs. of  2 variables:
##  $ contract_id: chr  "H0022" "H0028" "H0084" "H0104" ...
##  $ tier_label : int  3 5 5 5 5 5 4 4 4 3 ...
tail(cost_tier4)
##     contract_id tier_label
## 580       S6506          5
## 581       S7230          5
## 582       S7610          5
## 583       S7694          5
## 584       S8067          5
## 585       S9579          5

Durg and Health Plan Score Ratings

# load rating data
ratingdata <- read.csv("./data/2015StarRatings_flatfiles/vwStarRating_SummaryScores.csv",na.strings = c("Plan too new to be measured", "Not enough data available"), stringsAsFactors = FALSE)
# data points before preprocessing
str(ratingdata)
## 'data.frame':    3516 obs. of  5 variables:
##  $ Contract_ID         : chr  "E0654" "E0654" "E2630" "E2630" ...
##  $ Summary_Score       : chr  "Resumen de las calificaciones de la calidad de los planes de medicamentos recetados" "Summary Rating of Prescription Drug Plan Quality" "Resumen de las calificaciones de la calidad de los planes de medicamentos recetados" "Summary Rating of Prescription Drug Plan Quality" ...
##  $ Star_Rating_Current : chr  "3.5 de 5 estrellas" "3.5 out of 5 stars" "4.5 de 5 estrellas" "4.5 out of 5 stars" ...
##  $ Star_Rating_Previous: chr  "4 de 5 estrellas" "4 out of 5 stars" "4 de 5 estrellas" "4 out of 5 stars" ...
##  $ lang_dscrptn        : chr  "Spanish" "English" "Spanish" "English" ...
# remove Spanish Version
ratingdata2 <- ratingdata[ratingdata$lang_dscrptn=="English",]
# keep the columns of  "Contract_ID", "Summary_Score", "Star_Rating_Current"
colnames <- c("Contract_ID", "Summary_Score", "Star_Rating_Current")
ratingdata2 <- ratingdata2[, colnames]
# split the string to get the rating number
ratingdata2$Star_Rating_Current <- sapply(strsplit(ratingdata2$Star_Rating_Current, split=" "), '[', 1)
#ratingdata2$Star_Rating_Previous <- sapply(strsplit(ratingdata2$Star_Rating_Previous, split=" "), '[', 1)
# remove NAs of the current 2015 year
ratingdata2_nocurrna <- ratingdata2[(!is.na(ratingdata2$Star_Rating_Current)), ]
# subset the rating data
ratingdata3 <- ratingdata2_nocurrna[ratingdata2_nocurrna$Summary_Score=="Overall Star Rating", 1:3]
#write.csv(ratingdata3, file = "./data/ratingdata.csv")
str(ratingdata2_nocurrna)
## 'data.frame':    1214 obs. of  3 variables:
##  $ Contract_ID        : chr  "E0654" "E2630" "E3014" "E4744" ...
##  $ Summary_Score      : chr  "Summary Rating of Prescription Drug Plan Quality" "Summary Rating of Prescription Drug Plan Quality" "Summary Rating of Prescription Drug Plan Quality" "Summary Rating of Prescription Drug Plan Quality" ...
##  $ Star_Rating_Current: chr  "3.5" "4.5" "4.5" "4" ...
tail(ratingdata2_nocurrna)
##      Contract_ID                                    Summary_Score
## 3504       S7230 Summary Rating of Prescription Drug Plan Quality
## 3506       S7610 Summary Rating of Prescription Drug Plan Quality
## 3508       S7694 Summary Rating of Prescription Drug Plan Quality
## 3510       S8067 Summary Rating of Prescription Drug Plan Quality
## 3512       S8841 Summary Rating of Prescription Drug Plan Quality
## 3514       S9579 Summary Rating of Prescription Drug Plan Quality
##      Star_Rating_Current
## 3504                 2.5
## 3506                   3
## 3508                   3
## 3510                 3.5
## 3512                 2.5
## 3514                   3
#head(ratingdata3)

Intermediate Data to Connect Contract and Zipcode

zip_contract <- read.csv("./data/2015Med2000_flatfiles/vwLocalContractServiceAreas.csv", stringsAsFactors = FALSE)
# data points before preprocessing
str(zip_contract)
## 'data.frame':    419003 obs. of  4 variables:
##  $ County_code  : int  1000 1000 1000 1000 1000 1000 1000 1000 1000 1000 ...
##  $ Zip_Code     : int  36003 36003 36003 36003 36003 36003 36006 36006 36006 36006 ...
##  $ Contract_ID  : chr  "H0104" "H0150" "H0151" "H0154" ...
##  $ Contract_Year: int  2015 2015 2015 2015 2015 2015 2015 2015 2015 2015 ...
zip_contract2 <- unique(zip_contract[, c("Zip_Code", "Contract_ID")])
#write.csv(zip_contract2, file = "./data/internal_geography.csv")
str(zip_contract2)
## 'data.frame':    364847 obs. of  2 variables:
##  $ Zip_Code   : int  36003 36003 36003 36003 36003 36003 36006 36006 36006 36006 ...
##  $ Contract_ID: chr  "H0104" "H0150" "H0151" "H0154" ...
tail(zip_contract2)
##        Zip_Code Contract_ID
## 418993    82442       H4652
## 418995    82701       H5435
## 418996    82715       H4652
## 418997    82715       H5435
## 418998    82723       H4652
## 418999    82723       H5435

Internal Features Combination: by contract_id

cost <- merge(cost_tier4, drug_cost4_aggr, all = TRUE, by = "contract_id")
cost_rating <- merge(cost, ratingdata3, all = TRUE, by.x = "contract_id", by.y = "Contract_ID")
cost_rating_zip <- merge(cost_rating, zip_contract2, all = TRUE, by.x = "contract_id", by.y = "Contract_ID")
cost_rating_zip2 <- na.omit(cost_rating_zip)
str(cost_rating_zip2)
## 'data.frame':    227432 obs. of  9 variables:
##  $ contract_id        : chr  "H0028" "H0028" "H0028" "H0028" ...
##  $ tier_label         : int  5 5 5 5 5 5 5 5 5 5 ...
##  $ mnthly_prm         : num  5.5 5.5 5.5 5.5 5.5 5.5 5.5 5.5 5.5 5.5 ...
##  $ ann_ddctbl         : num  160 160 160 160 160 160 160 160 160 160 ...
##  $ copay_max          : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ coin_max           : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Summary_Score      : chr  "Overall Star Rating" "Overall Star Rating" "Overall Star Rating" "Overall Star Rating" ...
##  $ Star_Rating_Current: chr  "4" "4" "4" "4" ...
##  $ Zip_Code           : int  50112 50105 50339 52305 52354 52319 96766 52329 50325 96747 ...
##  - attr(*, "na.action")=Class 'omit'  Named int [1:137510] 1 2 3 4 5 6 7 8 9 10 ...
##   .. ..- attr(*, "names")= chr [1:137510] "1" "2" "3" "4" ...
tail(cost_rating_zip)
##        contract_id tier_label mnthly_prm ann_ddctbl copay_max coin_max
## 364937       S7694          5   34.86364   261.8182        85       33
## 364938       S7950         NA   33.00000     0.0000         0        0
## 364939       S8067          5   67.00000     0.0000        89       33
## 364940       S8841         NA   33.00000     0.0000         0        0
## 364941       S9579          5   39.60870   153.0435         0        0
## 364942       S9701         NA   33.00000     0.0000         0        0
##        Summary_Score Star_Rating_Current Zip_Code
## 364937          <NA>                <NA>       NA
## 364938          <NA>                <NA>       NA
## 364939          <NA>                <NA>       NA
## 364940          <NA>                <NA>       NA
## 364941          <NA>                <NA>       NA
## 364942          <NA>                <NA>       NA
#write.csv(cost_rating_zip2, file = "./data/internal_data.csv")

Unemployment Rate

unemployment_data <- read.table("./data/unemployment.txt", stringsAsFactors = FALSE)
# data points before preprocessing
str(unemployment_data)
## 'data.frame':    29252 obs. of  8 variables:
##  $ V1: chr  "1." "2." "3." "20." ...
##  $ V2: int  21031 23001 29202 96119 23943 95232 15347 60604 33620 99553 ...
##  $ V3: chr  "39.488697," "37.292591," "34.023588," "41.042003," ...
##  $ V4: num  -76.7 -76.4 -81 -120.5 -78.5 ...
##  $ V5: chr  "," "," "," "," ...
##  $ V6: chr  "59" "28" "227" "70" ...
##  $ V7: num  100 100 100 66.7 47.8 ...
##  $ V8: chr  "%" "%" "%" "%" ...
unemployment_data2 <- unemployment_data[, c(2, 6, 7)]
colnames(unemployment_data2) <- c("zipcode", "polulation", "unemployment_rate")
unemployment_data2$polulation <- as.numeric(gsub("[[:punct:]]", "", unemployment_data2$polulation))
unemployment_data2$unemployment_rate <- as.numeric(unemployment_data2$unemployment_rate)
#write.csv(cost_rating_zip2, file = "./data/external_unemploy.csv")
str(unemployment_data2)
## 'data.frame':    29252 obs. of  3 variables:
##  $ zipcode          : int  21031 23001 29202 96119 23943 95232 15347 60604 33620 99553 ...
##  $ polulation       : num  59 28 227 70 1080 ...
##  $ unemployment_rate: num  100 100 100 66.7 47.8 ...
tail(unemployment_data2)
##       zipcode polulation unemployment_rate
## 29247   22723         79                 0
## 29248   19435         73                 0
## 29249   19436        497                 0
## 29250   19980         75                 0
## 29251   21653        128                 0
## 29252   21790         78                 0

Total Income

income_data <- read.csv("./data/zipcode2013/zipcodenoagi13.csv", stringsAsFactors = FALSE)
# data points before preprocessing
# str(income_data)
income_data2 <- income_data[, c("ZIPCODE", "A02650")]
colnames(income_data2)[2] <- "total_income"
#write.csv(income_data2, file = "./data/external_income.csv")
str(income_data2)
## 'data.frame':    27790 obs. of  2 variables:
##  $ ZIPCODE     : int  0 35004 35005 35006 35007 35010 35014 35016 35019 35020 ...
##  $ total_income: num  1.06e+08 2.58e+05 1.29e+05 5.86e+04 6.51e+05 ...
tail(income_data2)
##       ZIPCODE total_income
## 27785   83123        21124
## 27786   83126         8319
## 27787   83127        88398
## 27788   83128        63379
## 27789   83414        22319
## 27790   99999      2154270

Physician

# creates number of doctors, average years of experience, and number of specializations BY COUNTY
# indexed by each zip code (e.g. 5 zip codes in one county would all have same info)
#read physician data set, select subset, remove non-complete rows
doctors <- read.csv('./data/physician.csv', stringsAsFactors = FALSE)
str(doctors)
## 'data.frame':    268933 obs. of  43 variables:
##  $ NPI                                                         : int  1003001843 1891805826 1679797591 1124225222 1225065907 1972618387 1316121445 1932372281 1942246673 1750329595 ...
##  $ PAC.ID                                                      : num  5.60e+09 7.42e+09 4.59e+09 4.46e+08 9.54e+09 ...
##  $ Professional.Enrollment.ID                                  : chr  "I20050301000433" "I20120727000166" "I20050818000907" "I20071207000432" ...
##  $ Last.Name                                                   : chr  "LUECHINGER" "MATHIS" "MELNICK" "COOKE" ...
##  $ First.Name                                                  : chr  "INGE" "CYNTHIA" "JOSEPH" "ARYIAN" ...
##  $ Middle.Name                                                 : chr  "C" "M" "" "L" ...
##  $ Suffix                                                      : chr  "" "" "" "" ...
##  $ Gender                                                      : chr  "F" "F" "M" "F" ...
##  $ Credential                                                  : chr  "CNS" "" "CP" "" ...
##  $ Medical.school.name                                         : chr  "OTHER" "HAHNEMANN UNIVERSITY COLLEGE OF MEDICINE" "OTHER" "MEDICAL COLLEGE OF WISCONSIN" ...
##  $ Graduation.year                                             : int  1998 1992 1971 2003 1972 1976 2002 2008 1984 2000 ...
##  $ Primary.specialty                                           : chr  "CLINICAL NURSE SPECIALIST" "PHYSICAL MEDICINE AND REHABILITATION" "CLINICAL PSYCHOLOGIST" "OBSTETRICS/GYNECOLOGY" ...
##  $ Secondary.specialty.1                                       : chr  "" "" "" "" ...
##  $ Secondary.specialty.2                                       : chr  "" "" "" "" ...
##  $ Secondary.specialty.3                                       : chr  "" "" "" "" ...
##  $ Secondary.specialty.4                                       : chr  "" "" "" "" ...
##  $ All.secondary.specialties                                   : chr  "" "" "" "" ...
##  $ Organization.legal.name                                     : chr  "FAMILY THERAPY PC" "CYNTHIA M. MATHIS, MD, PC" "PORTLAND CENTER FOR COUNSELING AND PSYCHOTHERAPY" "WALDORF WOMENS CARE" ...
##  $ Organization.DBA.name                                       : logi  NA NA NA NA NA NA ...
##  $ Group.Practice.PAC.ID                                       : num  2.26e+09 4.46e+08 2.96e+09 3.68e+09 2.86e+09 ...
##  $ Number.of.Group.Practice.members                            : int  1 1 1 2 1 1 1 3 1 1 ...
##  $ Line.1.Street.Address                                       : chr  "1296 PARKER SERD" "6221 WILSHIRE BLVD" "178 MIDDLE ST" "11355 PEMBROOKE SQUARE" ...
##  $ Line.2.Street.Address                                       : chr  "" "" "" "" ...
##  $ Marker.of.address.line.2.suppression                        : chr  "N" "Y" "Y" "N" ...
##  $ City                                                        : chr  "CONYERS" "LOS ANGELES" "PORTLAND" "WALDORF" ...
##  $ State                                                       : chr  "GA" "CA" "ME" "MD" ...
##  $ Zip.Code                                                    : chr  "300945951" "900485215" "041014075" "206034805" ...
##  $ Claims.based.hospital.affiliation.CCN.1                     : int  NA 50625 NA 210035 NA NA NA NA NA NA ...
##  $ Claims.based.hospital.affiliation.LBN.1                     : chr  "" "CEDARS-SINAI MEDICAL CENTER" "" "CIVISTA MEDICAL CENTER INC." ...
##  $ Claims.based.hospital.affiliation.CCN.2                     : chr  "" "" "" "" ...
##  $ Claims.based.hospital.affiliation.LBN.2                     : chr  "" "" "" "" ...
##  $ Claims.based.hospital.affiliation.CCN.3                     : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ Claims.based.hospital.affiliation.LBN.3                     : chr  "" "" "" "" ...
##  $ Claims.based.hospital.affiliation.CCN.4                     : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ Claims.based.hospital.affiliation.LBN.4                     : chr  "" "" "" "" ...
##  $ Claims.based.hospital.affiliation.CCN.5                     : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ Claims.based.hospital.affiliation.LBN.5                     : chr  "" "" "" "" ...
##  $ Professional.accepts.Medicare.Assignment                    : chr  "M" "Y" "Y" "Y" ...
##  $ Participating.in.eRx                                        : logi  NA NA NA NA NA NA ...
##  $ Participating.in.PQRS                                       : chr  "" "" "" "" ...
##  $ Participating.in.EHR                                        : chr  "" "" "" "" ...
##  $ Received.PQRS.Maintenance.of.Certification.Program.Incentive: chr  "" "" "" "" ...
##  $ Participated.in.Million.Hearts                              : chr  "" "" "" "" ...
myVars <- c("NPI", "Graduation.year","Primary.specialty","City", "State", "Zip.Code")
doctors <- doctors[myVars]
doctors <- doctors[complete.cases(doctors),]
str(doctors)
## 'data.frame':    266552 obs. of  6 variables:
##  $ NPI              : int  1003001843 1891805826 1679797591 1124225222 1225065907 1972618387 1316121445 1932372281 1942246673 1750329595 ...
##  $ Graduation.year  : int  1998 1992 1971 2003 1972 1976 2002 2008 1984 2000 ...
##  $ Primary.specialty: chr  "CLINICAL NURSE SPECIALIST" "PHYSICAL MEDICINE AND REHABILITATION" "CLINICAL PSYCHOLOGIST" "OBSTETRICS/GYNECOLOGY" ...
##  $ City             : chr  "CONYERS" "LOS ANGELES" "PORTLAND" "WALDORF" ...
##  $ State            : chr  "GA" "CA" "ME" "MD" ...
##  $ Zip.Code         : chr  "300945951" "900485215" "041014075" "206034805" ...
head(doctors)
##          NPI Graduation.year                    Primary.specialty
## 1 1003001843            1998            CLINICAL NURSE SPECIALIST
## 2 1891805826            1992 PHYSICAL MEDICINE AND REHABILITATION
## 3 1679797591            1971                CLINICAL PSYCHOLOGIST
## 4 1124225222            2003                OBSTETRICS/GYNECOLOGY
## 5 1225065907            1972                         CHIROPRACTIC
## 6 1972618387            1976                          DERMATOLOGY
##          City State  Zip.Code
## 1     CONYERS    GA 300945951
## 2 LOS ANGELES    CA 900485215
## 3    PORTLAND    ME 041014075
## 4     WALDORF    MD 206034805
## 5      OTTAWA    OH 458751817
## 6       ALTON    IL 620026326
#deal with zip code format 
doctors$Zip.Code<- as.numeric(doctors$Zip.Code)
doctors$set<-ifelse(doctors$Zip.Code > 99999,1, 2)

#separate data by 9-digit and 5-digit formats
data <- split(doctors, doctors$set)
temp1<-data[[1]] #data with 9-digit zip Codes

#add leading zeros so all data has 9 digits: 1234567 --> 001234567
# take first 5 digits, then remove 0s: 00123 --> 123
library(stringr)
temp1$Zip.Code <-str_pad(temp1$Zip.Code, 9, pad = "0")  
temp1$Zip.Code <-substr(temp1$Zip.Code,0,5)
temp1$Zip.Code <-substr(temp1$Zip.Code,regexpr("[^0]",temp1$Zip.Code),nchar(temp1$Zip.Code))

#recombine dataset separated by zip code
doctors <- rbind(temp1, data[[2]]) 
doctors$set <- NULL #remove dummy variable

#data cleaning & renaming 
doctors[2]<-2016 - doctors$Graduation.year
names(doctors)[2] <- "experience"
names(doctors)[6] <- "zip_code"
head(doctors)
##          NPI experience                    Primary.specialty        City
## 1 1003001843         18            CLINICAL NURSE SPECIALIST     CONYERS
## 2 1891805826         24 PHYSICAL MEDICINE AND REHABILITATION LOS ANGELES
## 3 1679797591         45                CLINICAL PSYCHOLOGIST    PORTLAND
## 4 1124225222         13                OBSTETRICS/GYNECOLOGY     WALDORF
## 5 1225065907         44                         CHIROPRACTIC      OTTAWA
## 6 1972618387         40                          DERMATOLOGY       ALTON
##   State zip_code
## 1    GA    30094
## 2    CA    90048
## 3    ME     4101
## 4    MD    20603
## 5    OH    45875
## 6    IL    62002
#combine geography information. Fill in zips w/o doctors w/ 0
geography <-read.csv('./data/geographyGood.csv', stringsAsFactors = FALSE)
#myVars <- c("CountyFIPSCode","zip_code")
#geography <- geography[myVars]
names(geography)[2] <- "CountyFIPSCode"

doctors <- merge(x=doctors, y=geography, by="zip_code")
doctors[is.na(doctors)] <- 0

doctors<- unique(doctors)

library(plyr)
#create dataset for number of doctors in each county
myVars <- c("NPI","CountyFIPSCode")
docNum <- doctors[myVars]
docNum <- ddply(docNum, .(CountyFIPSCode), mutate, count = length(unique(NPI)))
docNum$NPI<- NULL
docNum <- unique((docNum))
docNum <- docNum[complete.cases(docNum),]

#create dataset for average experience of doctors in each county
myVars <- c("experience","CountyFIPSCode")
yearDoc <- doctors[myVars]
yearDoc<- aggregate(yearDoc,by=list(yearDoc$CountyFIPSCode), FUN=mean, na.rm=TRUE)
yearDoc$Group.1 <-NULL

#join on county code
docNum <- merge(x=docNum, y=yearDoc, by="CountyFIPSCode", all.x = TRUE )

#create dataset for number of specialties in each county
myVars <- c("Primary.specialty","CountyFIPSCode")
docSpec <- doctors[myVars]
docSpec <- ddply(docSpec, .(CountyFIPSCode), mutate, count = length(unique(Primary.specialty)))
docSpec$Primary.specialty <- NULL
docSpec <- unique((docSpec))
names(docSpec)[2]<- "spec_count"

docNum <- merge(x=docNum, y=docSpec, by="CountyFIPSCode", all.x = TRUE )
#docNum$count<-ifelse(docNum$experience == 0,0, docNum$count)
#docNum$spec_count<-ifelse(docNum$experience == 0,0, docNum$spec_count)

geography <- merge(x = docNum, y = geography, all.y = TRUE)
geography[is.na(geography)] <- 0

docNum <- geography
docNum$StateCode <- docNum$StateName <- docNum$CountyName <- NULL

rm(docSpec, doctors, yearDoc, data)

temp <- docNum$zip_code
temp <- unique(temp)

n_occur <- data.frame(table(docNum$zip_code))
n_occur[n_occur$Freq > 1,]
## [1] Var1 Freq
## <0 rows> (or 0-length row.names)
dupes <- docNum[docNum$zip_code %in% n_occur$Var1[n_occur$Freq > 1],]

#write.csv(docNum, file = "./data/docByZip2.csv",row.names=FALSE)
physician <- read.csv(file = "./data/docByZip2.csv", stringsAsFactors = FALSE)
str(physician)
## 'data.frame':    41184 obs. of  5 variables:
##  $ CountyFIPSCode: int  1001 1001 1001 1003 1003 1003 1003 1003 1003 1003 ...
##  $ count         : int  0 0 0 154 154 154 154 154 154 154 ...
##  $ experience    : num  0 0 0 22.4 22.4 ...
##  $ spec_count    : int  0 0 0 29 29 29 29 29 29 29 ...
##  $ zip_code      : int  36008 36068 36003 36559 36579 36535 36550 36551 36547 36526 ...
summary(physician)
##  CountyFIPSCode      count          experience      spec_count   
##  Min.   : 1001   Min.   :   0.0   Min.   : 0.00   Min.   : 0.00  
##  1st Qu.:17149   1st Qu.:  13.0   1st Qu.:19.34   1st Qu.: 6.00  
##  Median :29199   Median :  66.0   Median :21.91   Median :19.00  
##  Mean   :29626   Mean   : 310.1   Mean   :20.58   Mean   :24.16  
##  3rd Qu.:42069   3rd Qu.: 300.0   3rd Qu.:23.94   3rd Qu.:41.00  
##  Max.   :78030   Max.   :4846.0   Max.   :64.00   Max.   :69.00  
##     zip_code    
##  Min.   :  501  
##  1st Qu.:26137  
##  Median :48856  
##  Mean   :49596  
##  3rd Qu.:72950  
##  Max.   :99950

External Features Combination: by zip code

unemploy_Income <- merge(unemployment_data2, income_data2, all = TRUE, by.x = "zipcode", by.y = "ZIPCODE")
unemploy_Income_0na <- na.omit(unemploy_Income)
unemploy_Income_0na$average_income <- unemploy_Income_0na$total_income / unemploy_Income_0na$polulation
phy_unemploy_income <- merge(physician, unemploy_Income_0na, all = TRUE, by.x = "zip_code", by.y = "zipcode")
phy_unemploy_income_0na <- na.omit(phy_unemploy_income)
phy_unemploy_income_0na$average_physician = round(phy_unemploy_income_0na$polulation / (phy_unemploy_income_0na$count + 1))
str(phy_unemploy_income_0na)
## 'data.frame':    25376 obs. of  10 variables:
##  $ zip_code         : int  1001 1002 1005 1008 1010 1011 1012 1020 1022 1026 ...
##  $ CountyFIPSCode   : int  25013 25015 25027 25013 25013 25015 25015 25013 25013 25015 ...
##  $ count            : int  217 78 401 217 217 78 78 217 217 78 ...
##  $ experience       : num  26.6 25.2 23.6 26.6 26.6 ...
##  $ spec_count       : int  43 26 41 43 43 26 26 43 43 26 ...
##  $ polulation       : num  16576 36794 5077 1235 3348 ...
##  $ unemployment_rate: num  3.97 7.95 5.68 3.76 2.95 ...
##  $ total_income     : num  477601 762298 129645 38867 132147 ...
##  $ average_income   : num  28.8 20.7 25.5 31.5 39.5 ...
##  $ average_physician: num  76 466 13 6 15 26 7 134 11 13 ...
##  - attr(*, "na.action")=Class 'omit'  Named int [1:15808] 1 2 3 4 5 6 7 8 9 10 ...
##   .. ..- attr(*, "names")= chr [1:15808] "1" "2" "3" "4" ...
#write.csv(phy_unemploy_income_0na, file = "./external_data_v2.csv")

Combine Internal and External Data

final_data <- merge(phy_unemploy_income_0na, cost_rating_zip2, all = TRUE, by.x = "zip_code", by.y = "Zip_Code")
final_data <- na.omit(final_data)
colnames <- c('zip_code', 'experience', 'spec_count', 'unemployment_rate', 'average_income', 'average_physician', 'contract_id', 'tier_label', 'mnthly_prm', 'ann_ddctbl', 'copay_max', 'coin_max', 'Star_Rating_Current')
final_data2 <- final_data[, colnames]
str(final_data2)
## 'data.frame':    132083 obs. of  13 variables:
##  $ zip_code           : int  1001 1001 1001 1001 1001 1001 1002 1002 1002 1002 ...
##  $ experience         : num  26.6 26.6 26.6 26.6 26.6 ...
##  $ spec_count         : int  43 43 43 43 43 43 26 26 26 26 ...
##  $ unemployment_rate  : num  3.97 3.97 3.97 3.97 3.97 3.97 7.95 7.95 7.95 7.95 ...
##  $ average_income     : num  28.8 28.8 28.8 28.8 28.8 ...
##  $ average_physician  : num  76 76 76 76 76 76 466 466 466 466 ...
##  $ contract_id        : chr  "H2261" "H8578" "H9001" "H2230" ...
##  $ tier_label         : int  5 4 5 5 5 5 5 5 4 5 ...
##  $ mnthly_prm         : num  28.5 30.6 21.5 26 17.6 ...
##  $ ann_ddctbl         : num  130 11.5 76.8 168 35.7 ...
##  $ copay_max          : num  0 0 76 0 0 0 0 0 0 0 ...
##  $ coin_max           : num  0 0 33 0 25 25 0 25 0 0 ...
##  $ Star_Rating_Current: chr  "4.5" "4" "4.5" "4.5" ...
tail(final_data2)
##        zip_code experience spec_count unemployment_rate average_income
## 228324    99362    22.5082         25              9.21       25.95161
## 228325    99362    22.5082         25              9.21       25.95161
## 228326    99362    22.5082         25              9.21       25.95161
## 228330    99371    29.0000          1              8.54       14.71838
## 228331    99401     0.0000          0              8.21       25.87783
## 228332    99402     0.0000          0              7.27       34.88941
##        average_physician contract_id tier_label mnthly_prm ann_ddctbl
## 228324               613       H5826          3   15.40000    64.0000
## 228325               613       H3813          5   38.00000    24.0000
## 228326               613       H6609          5   26.64634   209.5122
## 228330               210       H5826          3   15.40000    64.0000
## 228331               221       H1304          5   34.87500    53.1250
## 228332              1709       H1304          5   34.87500    53.1250
##        copay_max coin_max Star_Rating_Current
## 228324         0       25                   3
## 228325         0        0                 3.5
## 228326        90       28                   4
## 228330         0       25                   3
## 228331         0        0                   4
## 228332         0        0                   4
#write.csv(final_data2, file = "./data/final_data_v2.csv", row.names = FALSE)

Combine Geographic Info

geo <- read.csv(file = "./data/zip_codes_states.csv", stringsAsFactors = FALSE)
final_data3 <- merge(final_data2, geo, all.x = TRUE, by.x = "zip_code", by.y = "zip_code")
str(final_data3)
## 'data.frame':    132083 obs. of  18 variables:
##  $ zip_code           : int  1001 1001 1001 1001 1001 1001 1002 1002 1002 1002 ...
##  $ experience         : num  26.6 26.6 26.6 26.6 26.6 ...
##  $ spec_count         : int  43 43 43 43 43 43 26 26 26 26 ...
##  $ unemployment_rate  : num  3.97 3.97 3.97 3.97 3.97 3.97 7.95 7.95 7.95 7.95 ...
##  $ average_income     : num  28.8 28.8 28.8 28.8 28.8 ...
##  $ average_physician  : num  76 76 76 76 76 76 466 466 466 466 ...
##  $ contract_id        : chr  "H2261" "H8578" "H9001" "H2230" ...
##  $ tier_label         : int  5 4 5 5 5 5 5 5 4 5 ...
##  $ mnthly_prm         : num  28.5 30.6 21.5 26 17.6 ...
##  $ ann_ddctbl         : num  130 11.5 76.8 168 35.7 ...
##  $ copay_max          : num  0 0 76 0 0 0 0 0 0 0 ...
##  $ coin_max           : num  0 0 33 0 25 25 0 25 0 0 ...
##  $ Star_Rating_Current: chr  "4.5" "4" "4.5" "4.5" ...
##  $ latitude           : num  42.1 42.1 42.1 42.1 42.1 ...
##  $ longitude          : num  -72.8 -72.8 -72.8 -72.8 -72.8 ...
##  $ city               : chr  "Agawam" "Agawam" "Agawam" "Agawam" ...
##  $ state              : chr  "MA" "MA" "MA" "MA" ...
##  $ county             : chr  "Hampden" "Hampden" "Hampden" "Hampden" ...
#final_data3 <- unique(final_data3)
#write.csv(final_data3, file = "./data/final_data_v3.csv", row.names = FALSE)

Exploratory Plotting

library(ggplot2)
library(gridExtra)
summary(final_data3)
##     zip_code       experience      spec_count    unemployment_rate
##  Min.   : 1001   Min.   : 0.00   Min.   : 0.00   Min.   :  0.000  
##  1st Qu.:19310   1st Qu.:20.25   1st Qu.:11.00   1st Qu.:  3.100  
##  Median :45315   Median :22.55   Median :31.00   Median :  4.480  
##  Mean   :46907   Mean   :21.63   Mean   :31.35   Mean   :  5.553  
##  3rd Qu.:72030   3rd Qu.:24.22   3rd Qu.:50.00   3rd Qu.:  6.750  
##  Max.   :99402   Max.   :64.00   Max.   :69.00   Max.   :100.000  
##  average_income     average_physician contract_id          tier_label   
##  Min.   :   0.197   Min.   :    0.0   Length:132083      Min.   :2.000  
##  1st Qu.:  18.283   1st Qu.:   18.0   Class :character   1st Qu.:5.000  
##  Median :  24.860   Median :   46.0   Mode  :character   Median :5.000  
##  Mean   :  38.924   Mean   :  189.4                      Mean   :5.105  
##  3rd Qu.:  37.109   3rd Qu.:  140.0                      3rd Qu.:5.000  
##  Max.   :7395.489   Max.   :25198.0                      Max.   :7.000  
##    mnthly_prm      ann_ddctbl       copay_max       coin_max    
##  Min.   : 0.00   Min.   :  0.00   Min.   : 0.0   Min.   : 0.00  
##  1st Qu.:10.50   1st Qu.:  0.00   1st Qu.: 0.0   1st Qu.: 0.00  
##  Median :21.52   Median : 94.12   Median : 0.0   Median :25.00  
##  Mean   :19.95   Mean   :108.20   Mean   :27.6   Mean   :19.91  
##  3rd Qu.:26.65   3rd Qu.:161.50   3rd Qu.:80.0   3rd Qu.:33.00  
##  Max.   :68.71   Max.   :320.00   Max.   :95.0   Max.   :50.00  
##  Star_Rating_Current    latitude       longitude           city          
##  Length:132083       Min.   :19.27   Min.   :-168.02   Length:132083     
##  Class :character    1st Qu.:34.11   1st Qu.: -95.43   Class :character  
##  Mode  :character    Median :39.94   Median : -85.42   Mode  :character  
##                      Mean   :38.31   Mean   : -89.33                     
##                      3rd Qu.:41.99   3rd Qu.: -78.84                     
##                      Max.   :48.95   Max.   : -67.04                     
##     state              county         
##  Length:132083      Length:132083     
##  Class :character   Class :character  
##  Mode  :character   Mode  :character  
##                                       
##                                       
## 
plotdata <- final_data3
# external feature plotting
ggplot(data = plotdata, aes(factor(Star_Rating_Current), experience)) +
    guides(fill=FALSE) +
    labs(x="Rating Scores", y="Average Experience Years") +
    geom_boxplot(aes(fill = factor(Star_Rating_Current)))

ggplot(data = plotdata, aes(factor(Star_Rating_Current), spec_count)) +
    guides(fill=FALSE) +
    labs(x="Rating Scores", y="Specialty Counts") +
    geom_boxplot(aes(fill = factor(Star_Rating_Current)))

ggplot(data = plotdata, aes(factor(Star_Rating_Current), unemployment_rate)) +
    guides(fill=FALSE) +
    labs(x="Rating Scores", y="Unemployment Rate(%)") +
    geom_boxplot(aes(fill = factor(Star_Rating_Current)))

ggplot(data = plotdata, aes(factor(Star_Rating_Current), average_income)) +
    guides(fill=FALSE) +
    labs(x="Rating Scores", y="Average Income(,000$)") +
    geom_boxplot(aes(fill = factor(Star_Rating_Current)))

ggplot(data = plotdata, aes(factor(Star_Rating_Current), average_physician)) +
    #theme(legend.position="bottom", legend.title=element_blank()) +
    guides(fill=FALSE) +
    labs(x="Rating Scores", y="Average Citizens per Physician") +
    geom_boxplot(aes(fill = factor(Star_Rating_Current)))

# external feature plotting
ggplot(data = plotdata, aes(factor(Star_Rating_Current), tier_label)) +
    guides(fill=FALSE) +
    labs(x="Rating Scores", y="Tiers") +
    geom_boxplot(aes(fill = factor(Star_Rating_Current)))

ggplot(data = plotdata, aes(factor(Star_Rating_Current), mnthly_prm)) +
    guides(fill=FALSE) +
    labs(x="Rating Scores", y="Monthly Premium") +
    geom_boxplot(aes(fill = factor(Star_Rating_Current)))

ggplot(data = plotdata, aes(factor(Star_Rating_Current), ann_ddctbl)) +
    guides(fill=FALSE) +
    labs(x="Rating Scores", y="Annualy Deductible") +
    geom_boxplot(aes(fill = factor(Star_Rating_Current)))

ggplot(data = plotdata, aes(factor(Star_Rating_Current), copay_max)) +
    guides(fill=FALSE) +
    labs(x="Rating Scores", y="Maxium Copayment") +
    geom_boxplot(aes(fill = factor(Star_Rating_Current)))

ggplot(data = plotdata, aes(factor(Star_Rating_Current), coin_max)) +
    #theme(legend.position="bottom", legend.title=element_blank()) +
    guides(fill=FALSE) +
    labs(x="Rating Scores", y="Maxium Coinsurance") +
    geom_boxplot(aes(fill = factor(Star_Rating_Current)))

ggplot(data = plotdata, aes(factor(Star_Rating_Current))) +
    guides(fill=FALSE) +
    geom_bar(aes(fill = factor(Star_Rating_Current)))

# explore the clustering of

Split the dataset for modeling

# here we have two datasets for modeling: first without removing 4 point, second with removing 4 points
final_data3 <- read.csv(file = './data/final_data_v3.csv')
# remove neutral 4 points, and change the label into 1 and 0
final_data3_0rm4 <- final_data3
final_data3_rm4 <- final_data3[final_data3$Star_Rating_Current != 4, ]
# transfer the label into binary value
final_data3_0rm4$sentiment <- as.factor(ifelse(final_data3_0rm4$Star_Rating_Current > 4,1,0))
final_data3_rm4$sentiment <- as.factor(ifelse(final_data3_rm4$Star_Rating_Current > 4,1,0))
# subset the need columns for modeling
colnames <- c('experience', 'spec_count', 'unemployment_rate', 'average_income', 'average_physician', 'tier_label', 'mnthly_prm', 'ann_ddctbl', 'copay_max', 'coin_max', 'sentiment')
final_data3_0rm4 <- final_data3_0rm4[, colnames]
final_data3_rm4 <- final_data3_rm4[, colnames]
str(final_data3_0rm4)
## 'data.frame':    132083 obs. of  11 variables:
##  $ experience       : num  26.6 26.6 26.6 26.6 26.6 ...
##  $ spec_count       : int  43 43 43 43 43 43 26 26 26 26 ...
##  $ unemployment_rate: num  3.97 3.97 3.97 3.97 3.97 3.97 7.95 7.95 7.95 7.95 ...
##  $ average_income   : num  28.8 28.8 28.8 28.8 28.8 ...
##  $ average_physician: int  76 76 76 76 76 76 466 466 466 466 ...
##  $ tier_label       : int  5 4 5 5 5 5 5 5 4 5 ...
##  $ mnthly_prm       : num  28.5 30.6 21.5 26 17.6 ...
##  $ ann_ddctbl       : num  130 11.5 76.8 168 35.7 ...
##  $ copay_max        : int  0 0 76 0 0 0 0 0 0 0 ...
##  $ coin_max         : int  0 0 33 0 25 25 0 25 0 0 ...
##  $ sentiment        : Factor w/ 2 levels "0","1": 2 1 2 2 2 1 2 2 1 2 ...
str(final_data3_rm4)
## 'data.frame':    75506 obs. of  11 variables:
##  $ experience       : num  26.6 26.6 26.6 26.6 25.2 ...
##  $ spec_count       : int  43 43 43 43 26 26 26 26 41 41 ...
##  $ unemployment_rate: num  3.97 3.97 3.97 3.97 7.95 7.95 7.95 7.95 5.68 5.68 ...
##  $ average_income   : num  28.8 28.8 28.8 28.8 20.7 ...
##  $ average_physician: int  76 76 76 76 466 466 466 466 13 13 ...
##  $ tier_label       : int  5 5 5 5 5 5 5 5 5 5 ...
##  $ mnthly_prm       : num  28.5 21.5 26 17.6 28.5 ...
##  $ ann_ddctbl       : num  130 76.8 168 35.7 130 ...
##  $ copay_max        : int  0 76 0 0 0 0 0 76 0 76 ...
##  $ coin_max         : int  0 33 0 25 0 25 0 33 0 33 ...
##  $ sentiment        : Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
# subset the train and test data sets
set.seed(510)
# for data set without removing 4
sample_size_0rm4 <- floor(0.8 * nrow(final_data3_0rm4))
training_index_0rm4 <- sample(seq_len(nrow(final_data3_0rm4)), size = sample_size_0rm4)
train_0rm4 <- final_data3_0rm4[training_index_0rm4,]
test_0rm4 <- final_data3_0rm4[-training_index_0rm4,]
# for data set removing 4
set.seed(510)
sample_size_rm4 <- floor(0.8 * nrow(final_data3_rm4))
training_index_rm4 <- sample(seq_len(nrow(final_data3_rm4)), size = sample_size_rm4)
train_rm4 <- final_data3_rm4[training_index_rm4,]
test_rm4 <- final_data3_rm4[-training_index_rm4,]

Logistic Regression

# train the model without removing 4
logsitic_regression_model <- glm(formula = sentiment ~ ., family = binomial(link = 'logit'), data = train_0rm4)
summary(logsitic_regression_model)
## 
## Call:
## glm(formula = sentiment ~ ., family = binomial(link = "logit"), 
##     data = train_0rm4)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.5654  -0.7071  -0.4463  -0.1581   4.1043  
## 
## Coefficients:
##                     Estimate Std. Error  z value Pr(>|z|)    
## (Intercept)       -2.504e+00  1.048e-01  -23.895   <2e-16 ***
## experience         8.168e-05  2.106e-03    0.039    0.969    
## spec_count        -1.031e-02  4.869e-04  -21.173   <2e-16 ***
## unemployment_rate -3.032e-02  2.222e-03  -13.647   <2e-16 ***
## average_income    -3.063e-05  7.104e-05   -0.431    0.666    
## average_physician -3.390e-04  2.562e-05  -13.233   <2e-16 ***
## tier_label         4.806e-01  1.790e-02   26.851   <2e-16 ***
## mnthly_prm         1.089e-02  6.863e-04   15.874   <2e-16 ***
## ann_ddctbl        -1.250e-02  1.249e-04 -100.064   <2e-16 ***
## copay_max         -1.395e-02  3.230e-04  -43.185   <2e-16 ***
## coin_max           1.806e-02  6.970e-04   25.914   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 108827  on 105665  degrees of freedom
## Residual deviance:  91270  on 105655  degrees of freedom
## AIC: 91292
## 
## Number of Fisher Scoring iterations: 5
# evaluate the model
library(caret)
lr_pred <- predict(logsitic_regression_model, newdata = test_0rm4, type = "response")
lr_pred <- ifelse(lr_pred >= 0.5, 1, 0)
# evaluation <- cbind(test, pred)
# evaluation$correct <- ifelse(evaluation$sentiment == evaluation$pred,1,0)
# sum(evaluation$correct) / nrow(evaluation)
confusionMatrix(data = lr_pred, test_0rm4[, 'sentiment'])
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction     0     1
##          0 20450  4723
##          1   446   798
##                                           
##                Accuracy : 0.8043          
##                  95% CI : (0.7995, 0.8091)
##     No Information Rate : 0.791           
##     P-Value [Acc > NIR] : 4.154e-08       
##                                           
##                   Kappa : 0.1723          
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.9787          
##             Specificity : 0.1445          
##          Pos Pred Value : 0.8124          
##          Neg Pred Value : 0.6415          
##              Prevalence : 0.7910          
##          Detection Rate : 0.7741          
##    Detection Prevalence : 0.9529          
##       Balanced Accuracy : 0.5616          
##                                           
##        'Positive' Class : 0               
## 
# train the model removing 4
logsitic_regression_model <- glm(formula = sentiment ~ ., family = binomial(link = 'logit'), data = train_rm4)
summary(logsitic_regression_model)
## 
## Call:
## glm(formula = sentiment ~ ., family = binomial(link = "logit"), 
##     data = train_rm4)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.1455  -0.8498  -0.3612   0.9150   4.3925  
## 
## Coefficients:
##                     Estimate Std. Error z value Pr(>|z|)    
## (Intercept)       -3.016e+00  1.168e-01 -25.812   <2e-16 ***
## experience         2.207e-03  2.547e-03   0.867   0.3862    
## spec_count        -1.803e-02  5.695e-04 -31.665   <2e-16 ***
## unemployment_rate -3.620e-02  2.568e-03 -14.096   <2e-16 ***
## average_income    -1.650e-04  8.055e-05  -2.049   0.0405 *  
## average_physician -4.264e-04  3.048e-05 -13.988   <2e-16 ***
## tier_label         7.302e-01  2.043e-02  35.734   <2e-16 ***
## mnthly_prm         2.989e-02  8.587e-04  34.808   <2e-16 ***
## ann_ddctbl        -1.118e-02  1.334e-04 -83.809   <2e-16 ***
## copay_max         -1.866e-02  3.794e-04 -49.179   <2e-16 ***
## coin_max           1.792e-02  7.776e-04  23.051   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 79404  on 60403  degrees of freedom
## Residual deviance: 61649  on 60393  degrees of freedom
## AIC: 61671
## 
## Number of Fisher Scoring iterations: 5
# evaluate the model
library(caret)
lr_pred <- predict(logsitic_regression_model, newdata = test_rm4, type = "response")
lr_pred <- ifelse(lr_pred >= 0.5, 1, 0)
# evaluation <- cbind(test, pred)
# evaluation$correct <- ifelse(evaluation$sentiment == evaluation$pred,1,0)
# sum(evaluation$correct) / nrow(evaluation)
confusionMatrix(data = lr_pred, test_rm4[, 'sentiment'])
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 7932 2117
##          1 1540 3513
##                                           
##                Accuracy : 0.7578          
##                  95% CI : (0.7509, 0.7647)
##     No Information Rate : 0.6272          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.4712          
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.8374          
##             Specificity : 0.6240          
##          Pos Pred Value : 0.7893          
##          Neg Pred Value : 0.6952          
##              Prevalence : 0.6272          
##          Detection Rate : 0.5252          
##    Detection Prevalence : 0.6654          
##       Balanced Accuracy : 0.7307          
##                                           
##        'Positive' Class : 0               
## 

Neural Network

library (nnet)
# train the model without removing 4
neural_network_model <- nnet(formula =  sentiment ~ ., data =train_0rm4, decay = 0.5, size = 6)
## # weights:  73
## initial  value 53644.729157 
## iter  10 value 47637.208736
## iter  20 value 46524.108951
## iter  30 value 44920.621501
## iter  40 value 43875.582198
## iter  50 value 43175.307769
## iter  60 value 42339.380476
## iter  70 value 41774.526227
## iter  80 value 41074.695671
## iter  90 value 40310.029867
## iter 100 value 39763.241093
## final  value 39763.241093 
## stopped after 100 iterations
# summary(neural_network_model)
# evaluate the model
nn_pred <- predict(neural_network_model, newdata = test_0rm4)
nn_pred <- ifelse(nn_pred >= 0.5, 1, 0)
# confusion matrix
confusionMatrix(data = nn_pred, test_0rm4[, 'sentiment'])
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction     0     1
##          0 20383  3719
##          1   513  1802
##                                           
##                Accuracy : 0.8398          
##                  95% CI : (0.8353, 0.8442)
##     No Information Rate : 0.791           
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.3838          
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.9754          
##             Specificity : 0.3264          
##          Pos Pred Value : 0.8457          
##          Neg Pred Value : 0.7784          
##              Prevalence : 0.7910          
##          Detection Rate : 0.7716          
##    Detection Prevalence : 0.9124          
##       Balanced Accuracy : 0.6509          
##                                           
##        'Positive' Class : 0               
## 
# train the model removing 4
neural_network_model <- nnet(formula =  sentiment ~ ., data =train_rm4, decay = 0.5, size = 6)
## # weights:  73
## initial  value 42403.397320 
## iter  10 value 34878.621142
## iter  20 value 33571.315969
## iter  30 value 32112.158414
## iter  40 value 30900.358568
## iter  50 value 29992.900287
## iter  60 value 29429.434779
## iter  70 value 29219.767765
## iter  80 value 28883.006736
## iter  90 value 28732.590234
## iter 100 value 28649.498357
## final  value 28649.498357 
## stopped after 100 iterations
# summary(neural_network_model)
# evaluate the model
nn_pred <- predict(neural_network_model, newdata = test_rm4)
nn_pred <- ifelse(nn_pred >= 0.5, 1, 0)
# confusion matrix
confusionMatrix(data = nn_pred, test_rm4[, 'sentiment'])
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 8470 2375
##          1 1002 3255
##                                          
##                Accuracy : 0.7764         
##                  95% CI : (0.7697, 0.783)
##     No Information Rate : 0.6272         
##     P-Value [Acc > NIR] : < 2.2e-16      
##                                          
##                   Kappa : 0.4969         
##  Mcnemar's Test P-Value : < 2.2e-16      
##                                          
##             Sensitivity : 0.8942         
##             Specificity : 0.5782         
##          Pos Pred Value : 0.7810         
##          Neg Pred Value : 0.7646         
##              Prevalence : 0.6272         
##          Detection Rate : 0.5609         
##    Detection Prevalence : 0.7181         
##       Balanced Accuracy : 0.7362         
##                                          
##        'Positive' Class : 0              
##