Medicare.gov has released the last two years of data about “Drug and Health Plan” coverage around the country. The data includes who used the plans, as well as when, where, and how they were used. Many plans have low ratings, and we want to find out how predictors such as political affiliation, income level, insurance rates, unemployment rates, hospital type, or happiness index contribute to the final rating of one specific type of plan. This is where we start building the prediction model, which could serve as a reference for future plan upgrades or new plan releases.
| Algorithms | Rationale |
|---|---|
| Logistic Regression, Naive Bayes, Decision Tree, ANN | Since this is a supervised classification problem, we will apply several prediction models to the dataset and choose the one with the highest accuracy. |
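A quick note on the selection criterion before the data preparation starts: each candidate model is trained on the same training split and compared by accuracy on a held-out test set. A minimal sketch of that criterion, with a hypothetical helper that is not part of the pipeline below:

# hypothetical helper: fraction of correct class predictions on a held-out set
model_accuracy <- function(predicted, actual) {
  mean(as.character(predicted) == as.character(actual))
}
# the candidate with the highest held-out accuracy is the one kept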
drug_cost <- read.csv("./data/2015Med2000_flatfiles/vwPlanDrugsCostSharing.csv", stringsAsFactors = FALSE, na.strings = c("Information Not Available", ""))
# data points before preprocessing
str(drug_cost)
## 'data.frame': 5227 obs. of 28 variables:
## $ contract_id : chr "90091" "E0654" "E0654" "E0654" ...
## $ segment_id : int 0 0 0 0 0 0 0 0 0 0 ...
## $ plan_id : int 999 801 802 803 801 801 801 801 801 1 ...
## $ contract_year : int 2015 2015 2015 2015 2015 2015 2015 2015 2015 2015 ...
## $ language_id : int 1 1 1 1 1 1 1 1 1 1 ...
## $ plan_cvrg_typ_id : int 0 0 0 0 0 0 0 0 0 0 ...
## $ mnthly_prm : num 0 33 33 33 33 33 33 33 33 0 ...
## $ ann_ddctbl : num 0 0 0 0 0 0 0 0 0 0 ...
## $ icl_lmt : num 0 0 0 0 0 0 0 0 0 0 ...
## $ thrshld : num 0 0 0 0 0 0 0 0 0 0 ...
## $ copay_amt_inn_prefrd : chr NA NA NA NA ...
## $ copay_amt_inn_prefrd_lvl : int 0 0 0 0 0 0 0 0 0 0 ...
## $ coins_amt_inn_prefrd : chr NA NA NA NA ...
## $ coins_amt_inn_prefrd_lvl : int 0 0 0 0 0 0 0 0 0 0 ...
## $ copay_amt_inn_general : chr NA NA NA NA ...
## $ copay_amt_inn_general_lvl : int 0 0 0 0 0 0 0 0 0 0 ...
## $ coins_amt_inn_general : chr NA NA NA NA ...
## $ coins_amt_inn_general_lvl : int 0 0 0 0 0 0 0 0 0 0 ...
## $ copay_amt_mail_ordr_prefrd : chr NA NA NA NA ...
## $ copay_amt_mail_ordr_prefrd_lvl : int 0 0 0 0 0 0 0 0 0 0 ...
## $ coins_amt_mail_ordr_prefrd : chr NA NA NA NA ...
## $ coins_amt_mail_ordr_prefrd_lvl : int 0 0 0 0 0 0 0 0 0 0 ...
## $ copay_amt_mail_ordr_general : chr NA NA NA NA ...
## $ copay_amt_mail_ordr_general_lvl: int 0 0 0 0 0 0 0 0 0 0 ...
## $ coins_amt_mail_ordr_general : chr NA NA NA NA ...
## $ coins_amt_mail_ordr_general_lvl: int 0 0 0 0 0 0 0 0 0 0 ...
## $ is_mdcr_dfnd_std_yn : chr " " "N" "N" "N" ...
## $ post_thrshld_mdcr_std_cost : int 0 0 0 0 0 0 0 0 0 0 ...
# keep the ids plus the monthly premium, annual deductible, copay, and coinsurance columns
colnames <- c("contract_id", "plan_id", "mnthly_prm", "ann_ddctbl", "copay_amt_inn_prefrd", "coins_amt_inn_prefrd")
drug_cost2 <- drug_cost[,colnames]
drug_cost3 <- drug_cost2
# split the copay/coinsurance range strings into low and high parts
drug_cost3$copay_low <- sapply(strsplit(drug_cost2$copay_amt_inn_prefrd, split="-"), '[', 1)
drug_cost3$copay_high <- sapply(strsplit(drug_cost2$copay_amt_inn_prefrd, split="-"), '[', 2)
drug_cost3$coin_low <- sapply(strsplit(drug_cost2$coins_amt_inn_prefrd, split="-"), '[', 1)
drug_cost3$coin_high <- sapply(strsplit(drug_cost2$coins_amt_inn_prefrd, split="-"), '[', 2)
# remove $ and %
library(gdata)
drug_cost3$copay_low <- gsub("^\\$", "", trim(drug_cost3$copay_low))
drug_cost3$copay_high <- gsub("^\\$", "", trim(drug_cost3$copay_high))
drug_cost3$coin_low <- gsub("%$", "", trim(drug_cost3$coin_low))
drug_cost3$coin_high <- gsub("%$", "", trim(drug_cost3$coin_high))
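# quick sanity check of the parsing above on a sample value (illustrative only):
# strsplit("$5 - $45", split = "-")[[1]] returns c("$5 ", " $45"); trimming and stripping
# the leading "$" then leaves the plain numbers "5" and "45"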
# drug_cost3 <- na.omit(drug_cost3)
#change NA to 0
drug_cost3$copay_low[is.na(drug_cost3$copay_low)] <- 0
drug_cost3$copay_high[is.na(drug_cost3$copay_high)] <- 0
drug_cost3$coin_low[is.na(drug_cost3$coin_low)] <- 0
drug_cost3$coin_high[is.na(drug_cost3$coin_high)] <- 0
# choose the maximum value (convert to numeric first so the comparison is not lexicographic)
drug_cost3$copay_max <- pmax(as.numeric(drug_cost3$copay_low), as.numeric(drug_cost3$copay_high))
drug_cost3$coin_max <- pmax(as.numeric(drug_cost3$coin_low), as.numeric(drug_cost3$coin_high))
# subset the needed columns
drug_cost4 <- drug_cost3[, c("contract_id", "mnthly_prm", "ann_ddctbl", "copay_max", "coin_max")]
# change the column type to numeric
drug_cost4$ann_ddctbl <- as.numeric(drug_cost4$ann_ddctbl)
drug_cost4$copay_max <- as.numeric(drug_cost4$copay_max)
drug_cost4$coin_max <- as.numeric(drug_cost4$coin_max)
drug_cost4$mnthly_prm <- as.numeric(drug_cost4$mnthly_prm)
# aggregate the drug cost data by contract id
drug_cost4_aggr1 <- aggregate(drug_cost4[,c("mnthly_prm", "ann_ddctbl")], by = list(drug_cost4$contract_id), FUN = mean)
drug_cost4_aggr2 <- aggregate(drug_cost4[,c("copay_max", "coin_max")], by = list(drug_cost4$contract_id), FUN = max)
drug_cost4_aggr <- merge(drug_cost4_aggr1, drug_cost4_aggr2, all = TRUE, by.x = "Group.1", by.y = "Group.1")
colnames(drug_cost4_aggr) <- c("contract_id", "mnthly_prm", "ann_ddctbl", "copay_max", "coin_max")
#write.csv(drug_cost4_aggr, file = "./data/internal_drugcost.csv")
str(drug_cost4_aggr)
## 'data.frame': 816 obs. of 5 variables:
## $ contract_id: chr "90091" "E0654" "E2630" "E3014" ...
## $ mnthly_prm : num 0 33 33 33 33 33 33 0 0 5.5 ...
## $ ann_ddctbl : num 0 0 0 0 0 0 0 0 0 160 ...
## $ copay_max : num 0 0 0 0 0 0 0 0 0 0 ...
## $ coin_max : num 0 0 0 0 0 0 0 0 0 0 ...
tail(drug_cost4_aggr)
## contract_id mnthly_prm ann_ddctbl copay_max coin_max
## 811 S7694 34.86364 261.8182 85 33
## 812 S7950 33.00000 0.0000 0 0
## 813 S8067 67.00000 0.0000 89 33
## 814 S8841 33.00000 0.0000 0 0
## 815 S9579 39.60870 153.0435 0 0
## 816 S9701 33.00000 0.0000 0 0
# load vwPlanDrugTierCost.csv
cost_tier <- read.csv("./data/2015Med2000_flatfiles/vwPlanDrugTierCost.csv", stringsAsFactors = FALSE)
# data points before preprocessing
str(cost_tier)
## 'data.frame': 374040 obs. of 12 variables:
## $ Language : chr "English" "English" "English" "English" ...
## $ segment_id : int 0 0 0 0 0 0 0 0 0 0 ...
## $ contract_id : chr "H0022" "H0022" "H0022" "H0022" ...
## $ plan_id : int 1 1 1 1 1 1 1 1 1 1 ...
## $ contract_year : int 2015 2015 2015 2015 2015 2015 2015 2015 2015 2015 ...
## $ language_id : int 1 1 1 1 1 1 1 1 1 1 ...
## $ tier_level : int 1 1 1 1 1 1 1 1 1 1 ...
## $ tier_label : chr "Generic Drugs" "Generic Drugs" "Generic Drugs" "Generic Drugs" ...
## $ tier_type_desc : chr "Standard Retail Cost-Sharing 30 days" "Standard Mail Order Cost-Sharing gap 60 days" "Standard Mail Order Cost-Sharing 60 days" "Standard Mail Order Cost-Sharing gap 30 days" ...
## $ cost_share_pref : chr "" "" "" "" ...
## $ cost_share_gen : chr "" "" "" "" ...
## $ exception_tier_ind: int 0 0 0 0 0 0 0 0 0 0 ...
# keep contract_id, language_id, and tier_label
colnames <- c("contract_id", "language_id", "tier_label")
cost_tier2 <- cost_tier[, colnames]
cost_tier2 <- cost_tier2[cost_tier2$language_id == "1", ]
cost_tier3 <- cost_tier2[, c("contract_id", "tier_label")]
# count unique tier numbers for each contract id
cost_tier4 <- aggregate(data = cost_tier3, tier_label ~ contract_id, function(x) length(unique(x)))
#write.csv(drug_cost4_aggr, file = "./data/internal_costtier.csv")
str(cost_tier4)
## 'data.frame': 585 obs. of 2 variables:
## $ contract_id: chr "H0022" "H0028" "H0084" "H0104" ...
## $ tier_label : int 3 5 5 5 5 5 4 4 4 3 ...
tail(cost_tier4)
## contract_id tier_label
## 580 S6506 5
## 581 S7230 5
## 582 S7610 5
## 583 S7694 5
## 584 S8067 5
## 585 S9579 5
# load rating data
ratingdata <- read.csv("./data/2015StarRatings_flatfiles/vwStarRating_SummaryScores.csv",na.strings = c("Plan too new to be measured", "Not enough data available"), stringsAsFactors = FALSE)
# data points before preprocessing
str(ratingdata)
## 'data.frame': 3516 obs. of 5 variables:
## $ Contract_ID : chr "E0654" "E0654" "E2630" "E2630" ...
## $ Summary_Score : chr "Resumen de las calificaciones de la calidad de los planes de medicamentos recetados" "Summary Rating of Prescription Drug Plan Quality" "Resumen de las calificaciones de la calidad de los planes de medicamentos recetados" "Summary Rating of Prescription Drug Plan Quality" ...
## $ Star_Rating_Current : chr "3.5 de 5 estrellas" "3.5 out of 5 stars" "4.5 de 5 estrellas" "4.5 out of 5 stars" ...
## $ Star_Rating_Previous: chr "4 de 5 estrellas" "4 out of 5 stars" "4 de 5 estrellas" "4 out of 5 stars" ...
## $ lang_dscrptn : chr "Spanish" "English" "Spanish" "English" ...
# remove Spanish Version
ratingdata2 <- ratingdata[ratingdata$lang_dscrptn=="English",]
# keep the columns of "Contract_ID", "Summary_Score", "Star_Rating_Current"
colnames <- c("Contract_ID", "Summary_Score", "Star_Rating_Current")
ratingdata2 <- ratingdata2[, colnames]
# split the string to get the rating number
ratingdata2$Star_Rating_Current <- sapply(strsplit(ratingdata2$Star_Rating_Current, split=" "), '[', 1)
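# e.g. "3.5 out of 5 stars" becomes "3.5" after taking the first space-separated token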
#ratingdata2$Star_Rating_Previous <- sapply(strsplit(ratingdata2$Star_Rating_Previous, split=" "), '[', 1)
# remove rows with a missing current (2015) rating
ratingdata2_nocurrna <- ratingdata2[(!is.na(ratingdata2$Star_Rating_Current)), ]
# subset the rating data
ratingdata3 <- ratingdata2_nocurrna[ratingdata2_nocurrna$Summary_Score=="Overall Star Rating", 1:3]
#write.csv(ratingdata3, file = "./data/ratingdata.csv")
str(ratingdata2_nocurrna)
## 'data.frame': 1214 obs. of 3 variables:
## $ Contract_ID : chr "E0654" "E2630" "E3014" "E4744" ...
## $ Summary_Score : chr "Summary Rating of Prescription Drug Plan Quality" "Summary Rating of Prescription Drug Plan Quality" "Summary Rating of Prescription Drug Plan Quality" "Summary Rating of Prescription Drug Plan Quality" ...
## $ Star_Rating_Current: chr "3.5" "4.5" "4.5" "4" ...
tail(ratingdata2_nocurrna)
## Contract_ID Summary_Score
## 3504 S7230 Summary Rating of Prescription Drug Plan Quality
## 3506 S7610 Summary Rating of Prescription Drug Plan Quality
## 3508 S7694 Summary Rating of Prescription Drug Plan Quality
## 3510 S8067 Summary Rating of Prescription Drug Plan Quality
## 3512 S8841 Summary Rating of Prescription Drug Plan Quality
## 3514 S9579 Summary Rating of Prescription Drug Plan Quality
## Star_Rating_Current
## 3504 2.5
## 3506 3
## 3508 3
## 3510 3.5
## 3512 2.5
## 3514 3
#head(ratingdata3)
zip_contract <- read.csv("./data/2015Med2000_flatfiles/vwLocalContractServiceAreas.csv", stringsAsFactors = FALSE)
# data points before preprocessing
str(zip_contract)
## 'data.frame': 419003 obs. of 4 variables:
## $ County_code : int 1000 1000 1000 1000 1000 1000 1000 1000 1000 1000 ...
## $ Zip_Code : int 36003 36003 36003 36003 36003 36003 36006 36006 36006 36006 ...
## $ Contract_ID : chr "H0104" "H0150" "H0151" "H0154" ...
## $ Contract_Year: int 2015 2015 2015 2015 2015 2015 2015 2015 2015 2015 ...
zip_contract2 <- unique(zip_contract[, c("Zip_Code", "Contract_ID")])
#write.csv(zip_contract2, file = "./data/internal_geography.csv")
str(zip_contract2)
## 'data.frame': 364847 obs. of 2 variables:
## $ Zip_Code : int 36003 36003 36003 36003 36003 36003 36006 36006 36006 36006 ...
## $ Contract_ID: chr "H0104" "H0150" "H0151" "H0154" ...
tail(zip_contract2)
## Zip_Code Contract_ID
## 418993 82442 H4652
## 418995 82701 H5435
## 418996 82715 H4652
## 418997 82715 H5435
## 418998 82723 H4652
## 418999 82723 H5435
cost <- merge(cost_tier4, drug_cost4_aggr, all = TRUE, by = "contract_id")
cost_rating <- merge(cost, ratingdata3, all = TRUE, by.x = "contract_id", by.y = "Contract_ID")
cost_rating_zip <- merge(cost_rating, zip_contract2, all = TRUE, by.x = "contract_id", by.y = "Contract_ID")
cost_rating_zip2 <- na.omit(cost_rating_zip)
str(cost_rating_zip2)
## 'data.frame': 227432 obs. of 9 variables:
## $ contract_id : chr "H0028" "H0028" "H0028" "H0028" ...
## $ tier_label : int 5 5 5 5 5 5 5 5 5 5 ...
## $ mnthly_prm : num 5.5 5.5 5.5 5.5 5.5 5.5 5.5 5.5 5.5 5.5 ...
## $ ann_ddctbl : num 160 160 160 160 160 160 160 160 160 160 ...
## $ copay_max : num 0 0 0 0 0 0 0 0 0 0 ...
## $ coin_max : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Summary_Score : chr "Overall Star Rating" "Overall Star Rating" "Overall Star Rating" "Overall Star Rating" ...
## $ Star_Rating_Current: chr "4" "4" "4" "4" ...
## $ Zip_Code : int 50112 50105 50339 52305 52354 52319 96766 52329 50325 96747 ...
## - attr(*, "na.action")=Class 'omit' Named int [1:137510] 1 2 3 4 5 6 7 8 9 10 ...
## .. ..- attr(*, "names")= chr [1:137510] "1" "2" "3" "4" ...
tail(cost_rating_zip)
## contract_id tier_label mnthly_prm ann_ddctbl copay_max coin_max
## 364937 S7694 5 34.86364 261.8182 85 33
## 364938 S7950 NA 33.00000 0.0000 0 0
## 364939 S8067 5 67.00000 0.0000 89 33
## 364940 S8841 NA 33.00000 0.0000 0 0
## 364941 S9579 5 39.60870 153.0435 0 0
## 364942 S9701 NA 33.00000 0.0000 0 0
## Summary_Score Star_Rating_Current Zip_Code
## 364937 <NA> <NA> NA
## 364938 <NA> <NA> NA
## 364939 <NA> <NA> NA
## 364940 <NA> <NA> NA
## 364941 <NA> <NA> NA
## 364942 <NA> <NA> NA
#write.csv(cost_rating_zip2, file = "./data/internal_data.csv")
unemployment_data <- read.table("./data/unemployment.txt", stringsAsFactors = FALSE)
# data points before preprocessing
str(unemployment_data)
## 'data.frame': 29252 obs. of 8 variables:
## $ V1: chr "1." "2." "3." "20." ...
## $ V2: int 21031 23001 29202 96119 23943 95232 15347 60604 33620 99553 ...
## $ V3: chr "39.488697," "37.292591," "34.023588," "41.042003," ...
## $ V4: num -76.7 -76.4 -81 -120.5 -78.5 ...
## $ V5: chr "," "," "," "," ...
## $ V6: chr "59" "28" "227" "70" ...
## $ V7: num 100 100 100 66.7 47.8 ...
## $ V8: chr "%" "%" "%" "%" ...
unemployment_data2 <- unemployment_data[, c(2, 6, 7)]
colnames(unemployment_data2) <- c("zipcode", "polulation", "unemployment_rate")
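# population values may contain thousands separators (e.g. "1,080"), so strip punctuation before converting to numeric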
unemployment_data2$polulation <- as.numeric(gsub("[[:punct:]]", "", unemployment_data2$polulation))
unemployment_data2$unemployment_rate <- as.numeric(unemployment_data2$unemployment_rate)
#write.csv(cost_rating_zip2, file = "./data/external_unemploy.csv")
str(unemployment_data2)
## 'data.frame': 29252 obs. of 3 variables:
## $ zipcode : int 21031 23001 29202 96119 23943 95232 15347 60604 33620 99553 ...
## $ polulation : num 59 28 227 70 1080 ...
## $ unemployment_rate: num 100 100 100 66.7 47.8 ...
tail(unemployment_data2)
## zipcode polulation unemployment_rate
## 29247 22723 79 0
## 29248 19435 73 0
## 29249 19436 497 0
## 29250 19980 75 0
## 29251 21653 128 0
## 29252 21790 78 0
income_data <- read.csv("./data/zipcode2013/zipcodenoagi13.csv", stringsAsFactors = FALSE)
# data points before preprocessing
# str(income_data)
income_data2 <- income_data[, c("ZIPCODE", "A02650")]
colnames(income_data2)[2] <- "total_income"
#write.csv(income_data2, file = "./data/external_income.csv")
str(income_data2)
## 'data.frame': 27790 obs. of 2 variables:
## $ ZIPCODE : int 0 35004 35005 35006 35007 35010 35014 35016 35019 35020 ...
## $ total_income: num 1.06e+08 2.58e+05 1.29e+05 5.86e+04 6.51e+05 ...
tail(income_data2)
## ZIPCODE total_income
## 27785 83123 21124
## 27786 83126 8319
## 27787 83127 88398
## 27788 83128 63379
## 27789 83414 22319
## 27790 99999 2154270
# creates number of doctors, average years of experience, and number of specializations BY COUNTY
# indexed by each zip code (e.g. 5 zip codes in one county would all have same info)
#read physician data set, select subset, remove non-complete rows
doctors <- read.csv('./data/physician.csv', stringsAsFactors = FALSE)
str(doctors)
## 'data.frame': 268933 obs. of 43 variables:
## $ NPI : int 1003001843 1891805826 1679797591 1124225222 1225065907 1972618387 1316121445 1932372281 1942246673 1750329595 ...
## $ PAC.ID : num 5.60e+09 7.42e+09 4.59e+09 4.46e+08 9.54e+09 ...
## $ Professional.Enrollment.ID : chr "I20050301000433" "I20120727000166" "I20050818000907" "I20071207000432" ...
## $ Last.Name : chr "LUECHINGER" "MATHIS" "MELNICK" "COOKE" ...
## $ First.Name : chr "INGE" "CYNTHIA" "JOSEPH" "ARYIAN" ...
## $ Middle.Name : chr "C" "M" "" "L" ...
## $ Suffix : chr "" "" "" "" ...
## $ Gender : chr "F" "F" "M" "F" ...
## $ Credential : chr "CNS" "" "CP" "" ...
## $ Medical.school.name : chr "OTHER" "HAHNEMANN UNIVERSITY COLLEGE OF MEDICINE" "OTHER" "MEDICAL COLLEGE OF WISCONSIN" ...
## $ Graduation.year : int 1998 1992 1971 2003 1972 1976 2002 2008 1984 2000 ...
## $ Primary.specialty : chr "CLINICAL NURSE SPECIALIST" "PHYSICAL MEDICINE AND REHABILITATION" "CLINICAL PSYCHOLOGIST" "OBSTETRICS/GYNECOLOGY" ...
## $ Secondary.specialty.1 : chr "" "" "" "" ...
## $ Secondary.specialty.2 : chr "" "" "" "" ...
## $ Secondary.specialty.3 : chr "" "" "" "" ...
## $ Secondary.specialty.4 : chr "" "" "" "" ...
## $ All.secondary.specialties : chr "" "" "" "" ...
## $ Organization.legal.name : chr "FAMILY THERAPY PC" "CYNTHIA M. MATHIS, MD, PC" "PORTLAND CENTER FOR COUNSELING AND PSYCHOTHERAPY" "WALDORF WOMENS CARE" ...
## $ Organization.DBA.name : logi NA NA NA NA NA NA ...
## $ Group.Practice.PAC.ID : num 2.26e+09 4.46e+08 2.96e+09 3.68e+09 2.86e+09 ...
## $ Number.of.Group.Practice.members : int 1 1 1 2 1 1 1 3 1 1 ...
## $ Line.1.Street.Address : chr "1296 PARKER SERD" "6221 WILSHIRE BLVD" "178 MIDDLE ST" "11355 PEMBROOKE SQUARE" ...
## $ Line.2.Street.Address : chr "" "" "" "" ...
## $ Marker.of.address.line.2.suppression : chr "N" "Y" "Y" "N" ...
## $ City : chr "CONYERS" "LOS ANGELES" "PORTLAND" "WALDORF" ...
## $ State : chr "GA" "CA" "ME" "MD" ...
## $ Zip.Code : chr "300945951" "900485215" "041014075" "206034805" ...
## $ Claims.based.hospital.affiliation.CCN.1 : int NA 50625 NA 210035 NA NA NA NA NA NA ...
## $ Claims.based.hospital.affiliation.LBN.1 : chr "" "CEDARS-SINAI MEDICAL CENTER" "" "CIVISTA MEDICAL CENTER INC." ...
## $ Claims.based.hospital.affiliation.CCN.2 : chr "" "" "" "" ...
## $ Claims.based.hospital.affiliation.LBN.2 : chr "" "" "" "" ...
## $ Claims.based.hospital.affiliation.CCN.3 : int NA NA NA NA NA NA NA NA NA NA ...
## $ Claims.based.hospital.affiliation.LBN.3 : chr "" "" "" "" ...
## $ Claims.based.hospital.affiliation.CCN.4 : int NA NA NA NA NA NA NA NA NA NA ...
## $ Claims.based.hospital.affiliation.LBN.4 : chr "" "" "" "" ...
## $ Claims.based.hospital.affiliation.CCN.5 : int NA NA NA NA NA NA NA NA NA NA ...
## $ Claims.based.hospital.affiliation.LBN.5 : chr "" "" "" "" ...
## $ Professional.accepts.Medicare.Assignment : chr "M" "Y" "Y" "Y" ...
## $ Participating.in.eRx : logi NA NA NA NA NA NA ...
## $ Participating.in.PQRS : chr "" "" "" "" ...
## $ Participating.in.EHR : chr "" "" "" "" ...
## $ Received.PQRS.Maintenance.of.Certification.Program.Incentive: chr "" "" "" "" ...
## $ Participated.in.Million.Hearts : chr "" "" "" "" ...
myVars <- c("NPI", "Graduation.year","Primary.specialty","City", "State", "Zip.Code")
doctors <- doctors[myVars]
doctors <- doctors[complete.cases(doctors),]
str(doctors)
## 'data.frame': 266552 obs. of 6 variables:
## $ NPI : int 1003001843 1891805826 1679797591 1124225222 1225065907 1972618387 1316121445 1932372281 1942246673 1750329595 ...
## $ Graduation.year : int 1998 1992 1971 2003 1972 1976 2002 2008 1984 2000 ...
## $ Primary.specialty: chr "CLINICAL NURSE SPECIALIST" "PHYSICAL MEDICINE AND REHABILITATION" "CLINICAL PSYCHOLOGIST" "OBSTETRICS/GYNECOLOGY" ...
## $ City : chr "CONYERS" "LOS ANGELES" "PORTLAND" "WALDORF" ...
## $ State : chr "GA" "CA" "ME" "MD" ...
## $ Zip.Code : chr "300945951" "900485215" "041014075" "206034805" ...
head(doctors)
## NPI Graduation.year Primary.specialty
## 1 1003001843 1998 CLINICAL NURSE SPECIALIST
## 2 1891805826 1992 PHYSICAL MEDICINE AND REHABILITATION
## 3 1679797591 1971 CLINICAL PSYCHOLOGIST
## 4 1124225222 2003 OBSTETRICS/GYNECOLOGY
## 5 1225065907 1972 CHIROPRACTIC
## 6 1972618387 1976 DERMATOLOGY
## City State Zip.Code
## 1 CONYERS GA 300945951
## 2 LOS ANGELES CA 900485215
## 3 PORTLAND ME 041014075
## 4 WALDORF MD 206034805
## 5 OTTAWA OH 458751817
## 6 ALTON IL 620026326
#deal with zip code format
doctors$Zip.Code<- as.numeric(doctors$Zip.Code)
doctors$set<-ifelse(doctors$Zip.Code > 99999,1, 2)
#separate data by 9-digit and 5-digit formats
data <- split(doctors, doctors$set)
temp1<-data[[1]] #data with 9-digit zip Codes
#add leading zeros so all data has 9 digits: 1234567 --> 001234567
# take the first 5 digits, then strip the leading zeros: 00123 --> 123
library(stringr)
temp1$Zip.Code <-str_pad(temp1$Zip.Code, 9, pad = "0")
temp1$Zip.Code <-substr(temp1$Zip.Code,0,5)
temp1$Zip.Code <-substr(temp1$Zip.Code,regexpr("[^0]",temp1$Zip.Code),nchar(temp1$Zip.Code))
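# worked example: the Portland, ME record above has Zip.Code "041014075", which was read
# as the number 41014075; padding restores "041014075", substr keeps "04101", and stripping
# the leading zero leaves "4101", matching the numeric 5-digit zips in the other subset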
#recombine dataset separated by zip code
doctors <- rbind(temp1, data[[2]])
doctors$set <- NULL #remove dummy variable
#data cleaning & renaming
doctors[2]<-2016 - doctors$Graduation.year
names(doctors)[2] <- "experience"
names(doctors)[6] <- "zip_code"
head(doctors)
## NPI experience Primary.specialty City
## 1 1003001843 18 CLINICAL NURSE SPECIALIST CONYERS
## 2 1891805826 24 PHYSICAL MEDICINE AND REHABILITATION LOS ANGELES
## 3 1679797591 45 CLINICAL PSYCHOLOGIST PORTLAND
## 4 1124225222 13 OBSTETRICS/GYNECOLOGY WALDORF
## 5 1225065907 44 CHIROPRACTIC OTTAWA
## 6 1972618387 40 DERMATOLOGY ALTON
## State zip_code
## 1 GA 30094
## 2 CA 90048
## 3 ME 4101
## 4 MD 20603
## 5 OH 45875
## 6 IL 62002
#combine geography information. Fill in zips w/o doctors w/ 0
geography <-read.csv('./data/geographyGood.csv', stringsAsFactors = FALSE)
#myVars <- c("CountyFIPSCode","zip_code")
#geography <- geography[myVars]
names(geography)[2] <- "CountyFIPSCode"
doctors <- merge(x=doctors, y=geography, by="zip_code")
doctors[is.na(doctors)] <- 0
doctors<- unique(doctors)
library(plyr)
#create dataset for number of doctors in each county
myVars <- c("NPI","CountyFIPSCode")
docNum <- doctors[myVars]
docNum <- ddply(docNum, .(CountyFIPSCode), mutate, count = length(unique(NPI)))
docNum$NPI<- NULL
docNum <- unique((docNum))
docNum <- docNum[complete.cases(docNum),]
#create dataset for average experience of doctors in each county
myVars <- c("experience","CountyFIPSCode")
yearDoc <- doctors[myVars]
yearDoc<- aggregate(yearDoc,by=list(yearDoc$CountyFIPSCode), FUN=mean, na.rm=TRUE)
yearDoc$Group.1 <-NULL
#join on county code
docNum <- merge(x=docNum, y=yearDoc, by="CountyFIPSCode", all.x = TRUE )
#create dataset for number of specialties in each county
myVars <- c("Primary.specialty","CountyFIPSCode")
docSpec <- doctors[myVars]
docSpec <- ddply(docSpec, .(CountyFIPSCode), mutate, count = length(unique(Primary.specialty)))
docSpec$Primary.specialty <- NULL
docSpec <- unique((docSpec))
names(docSpec)[2]<- "spec_count"
docNum <- merge(x=docNum, y=docSpec, by="CountyFIPSCode", all.x = TRUE )
#docNum$count<-ifelse(docNum$experience == 0,0, docNum$count)
#docNum$spec_count<-ifelse(docNum$experience == 0,0, docNum$spec_count)
geography <- merge(x = docNum, y = geography, all.y = TRUE)
geography[is.na(geography)] <- 0
docNum <- geography
docNum$StateCode <- docNum$StateName <- docNum$CountyName <- NULL
rm(docSpec, doctors, yearDoc, data)
temp <- docNum$zip_code
temp <- unique(temp)
n_occur <- data.frame(table(docNum$zip_code))
n_occur[n_occur$Freq > 1,]
## [1] Var1 Freq
## <0 rows> (or 0-length row.names)
dupes <- docNum[docNum$zip_code %in% n_occur$Var1[n_occur$Freq > 1],]
#write.csv(docNum, file = "./data/docByZip2.csv",row.names=FALSE)
physician <- read.csv(file = "./data/docByZip2.csv", stringsAsFactors = FALSE)
str(physician)
## 'data.frame': 41184 obs. of 5 variables:
## $ CountyFIPSCode: int 1001 1001 1001 1003 1003 1003 1003 1003 1003 1003 ...
## $ count : int 0 0 0 154 154 154 154 154 154 154 ...
## $ experience : num 0 0 0 22.4 22.4 ...
## $ spec_count : int 0 0 0 29 29 29 29 29 29 29 ...
## $ zip_code : int 36008 36068 36003 36559 36579 36535 36550 36551 36547 36526 ...
summary(physician)
## CountyFIPSCode count experience spec_count
## Min. : 1001 Min. : 0.0 Min. : 0.00 Min. : 0.00
## 1st Qu.:17149 1st Qu.: 13.0 1st Qu.:19.34 1st Qu.: 6.00
## Median :29199 Median : 66.0 Median :21.91 Median :19.00
## Mean :29626 Mean : 310.1 Mean :20.58 Mean :24.16
## 3rd Qu.:42069 3rd Qu.: 300.0 3rd Qu.:23.94 3rd Qu.:41.00
## Max. :78030 Max. :4846.0 Max. :64.00 Max. :69.00
## zip_code
## Min. : 501
## 1st Qu.:26137
## Median :48856
## Mean :49596
## 3rd Qu.:72950
## Max. :99950
unemploy_Income <- merge(unemployment_data2, income_data2, all = TRUE, by.x = "zipcode", by.y = "ZIPCODE")
unemploy_Income_0na <- na.omit(unemploy_Income)
unemploy_Income_0na$average_income <- unemploy_Income_0na$total_income / unemploy_Income_0na$polulation
phy_unemploy_income <- merge(physician, unemploy_Income_0na, all = TRUE, by.x = "zip_code", by.y = "zipcode")
phy_unemploy_income_0na <- na.omit(phy_unemploy_income)
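# average_physician below approximates citizens per physician; adding 1 to the physician
# count avoids dividing by zero where a zip code's county has no listed physicians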
phy_unemploy_income_0na$average_physician = round(phy_unemploy_income_0na$polulation / (phy_unemploy_income_0na$count + 1))
str(phy_unemploy_income_0na)
## 'data.frame': 25376 obs. of 10 variables:
## $ zip_code : int 1001 1002 1005 1008 1010 1011 1012 1020 1022 1026 ...
## $ CountyFIPSCode : int 25013 25015 25027 25013 25013 25015 25015 25013 25013 25015 ...
## $ count : int 217 78 401 217 217 78 78 217 217 78 ...
## $ experience : num 26.6 25.2 23.6 26.6 26.6 ...
## $ spec_count : int 43 26 41 43 43 26 26 43 43 26 ...
## $ polulation : num 16576 36794 5077 1235 3348 ...
## $ unemployment_rate: num 3.97 7.95 5.68 3.76 2.95 ...
## $ total_income : num 477601 762298 129645 38867 132147 ...
## $ average_income : num 28.8 20.7 25.5 31.5 39.5 ...
## $ average_physician: num 76 466 13 6 15 26 7 134 11 13 ...
## - attr(*, "na.action")=Class 'omit' Named int [1:15808] 1 2 3 4 5 6 7 8 9 10 ...
## .. ..- attr(*, "names")= chr [1:15808] "1" "2" "3" "4" ...
#write.csv(phy_unemploy_income_0na, file = "./external_data_v2.csv")
final_data <- merge(phy_unemploy_income_0na, cost_rating_zip2, all = TRUE, by.x = "zip_code", by.y = "Zip_Code")
final_data <- na.omit(final_data)
colnames <- c('zip_code', 'experience', 'spec_count', 'unemployment_rate', 'average_income', 'average_physician', 'contract_id', 'tier_label', 'mnthly_prm', 'ann_ddctbl', 'copay_max', 'coin_max', 'Star_Rating_Current')
final_data2 <- final_data[, colnames]
str(final_data2)
## 'data.frame': 132083 obs. of 13 variables:
## $ zip_code : int 1001 1001 1001 1001 1001 1001 1002 1002 1002 1002 ...
## $ experience : num 26.6 26.6 26.6 26.6 26.6 ...
## $ spec_count : int 43 43 43 43 43 43 26 26 26 26 ...
## $ unemployment_rate : num 3.97 3.97 3.97 3.97 3.97 3.97 7.95 7.95 7.95 7.95 ...
## $ average_income : num 28.8 28.8 28.8 28.8 28.8 ...
## $ average_physician : num 76 76 76 76 76 76 466 466 466 466 ...
## $ contract_id : chr "H2261" "H8578" "H9001" "H2230" ...
## $ tier_label : int 5 4 5 5 5 5 5 5 4 5 ...
## $ mnthly_prm : num 28.5 30.6 21.5 26 17.6 ...
## $ ann_ddctbl : num 130 11.5 76.8 168 35.7 ...
## $ copay_max : num 0 0 76 0 0 0 0 0 0 0 ...
## $ coin_max : num 0 0 33 0 25 25 0 25 0 0 ...
## $ Star_Rating_Current: chr "4.5" "4" "4.5" "4.5" ...
tail(final_data2)
## zip_code experience spec_count unemployment_rate average_income
## 228324 99362 22.5082 25 9.21 25.95161
## 228325 99362 22.5082 25 9.21 25.95161
## 228326 99362 22.5082 25 9.21 25.95161
## 228330 99371 29.0000 1 8.54 14.71838
## 228331 99401 0.0000 0 8.21 25.87783
## 228332 99402 0.0000 0 7.27 34.88941
## average_physician contract_id tier_label mnthly_prm ann_ddctbl
## 228324 613 H5826 3 15.40000 64.0000
## 228325 613 H3813 5 38.00000 24.0000
## 228326 613 H6609 5 26.64634 209.5122
## 228330 210 H5826 3 15.40000 64.0000
## 228331 221 H1304 5 34.87500 53.1250
## 228332 1709 H1304 5 34.87500 53.1250
## copay_max coin_max Star_Rating_Current
## 228324 0 25 3
## 228325 0 0 3.5
## 228326 90 28 4
## 228330 0 25 3
## 228331 0 0 4
## 228332 0 0 4
#write.csv(final_data2, file = "./data/final_data_v2.csv", row.names = FALSE)
geo <- read.csv(file = "./data/zip_codes_states.csv", stringsAsFactors = FALSE)
final_data3 <- merge(final_data2, geo, all.x = TRUE, by.x = "zip_code", by.y = "zip_code")
str(final_data3)
## 'data.frame': 132083 obs. of 18 variables:
## $ zip_code : int 1001 1001 1001 1001 1001 1001 1002 1002 1002 1002 ...
## $ experience : num 26.6 26.6 26.6 26.6 26.6 ...
## $ spec_count : int 43 43 43 43 43 43 26 26 26 26 ...
## $ unemployment_rate : num 3.97 3.97 3.97 3.97 3.97 3.97 7.95 7.95 7.95 7.95 ...
## $ average_income : num 28.8 28.8 28.8 28.8 28.8 ...
## $ average_physician : num 76 76 76 76 76 76 466 466 466 466 ...
## $ contract_id : chr "H2261" "H8578" "H9001" "H2230" ...
## $ tier_label : int 5 4 5 5 5 5 5 5 4 5 ...
## $ mnthly_prm : num 28.5 30.6 21.5 26 17.6 ...
## $ ann_ddctbl : num 130 11.5 76.8 168 35.7 ...
## $ copay_max : num 0 0 76 0 0 0 0 0 0 0 ...
## $ coin_max : num 0 0 33 0 25 25 0 25 0 0 ...
## $ Star_Rating_Current: chr "4.5" "4" "4.5" "4.5" ...
## $ latitude : num 42.1 42.1 42.1 42.1 42.1 ...
## $ longitude : num -72.8 -72.8 -72.8 -72.8 -72.8 ...
## $ city : chr "Agawam" "Agawam" "Agawam" "Agawam" ...
## $ state : chr "MA" "MA" "MA" "MA" ...
## $ county : chr "Hampden" "Hampden" "Hampden" "Hampden" ...
#final_data3 <- unique(final_data3)
#write.csv(final_data3, file = "./data/final_data_v3.csv", row.names = FALSE)
library(ggplot2)
library(gridExtra)
summary(final_data3)
## zip_code experience spec_count unemployment_rate
## Min. : 1001 Min. : 0.00 Min. : 0.00 Min. : 0.000
## 1st Qu.:19310 1st Qu.:20.25 1st Qu.:11.00 1st Qu.: 3.100
## Median :45315 Median :22.55 Median :31.00 Median : 4.480
## Mean :46907 Mean :21.63 Mean :31.35 Mean : 5.553
## 3rd Qu.:72030 3rd Qu.:24.22 3rd Qu.:50.00 3rd Qu.: 6.750
## Max. :99402 Max. :64.00 Max. :69.00 Max. :100.000
## average_income average_physician contract_id tier_label
## Min. : 0.197 Min. : 0.0 Length:132083 Min. :2.000
## 1st Qu.: 18.283 1st Qu.: 18.0 Class :character 1st Qu.:5.000
## Median : 24.860 Median : 46.0 Mode :character Median :5.000
## Mean : 38.924 Mean : 189.4 Mean :5.105
## 3rd Qu.: 37.109 3rd Qu.: 140.0 3rd Qu.:5.000
## Max. :7395.489 Max. :25198.0 Max. :7.000
## mnthly_prm ann_ddctbl copay_max coin_max
## Min. : 0.00 Min. : 0.00 Min. : 0.0 Min. : 0.00
## 1st Qu.:10.50 1st Qu.: 0.00 1st Qu.: 0.0 1st Qu.: 0.00
## Median :21.52 Median : 94.12 Median : 0.0 Median :25.00
## Mean :19.95 Mean :108.20 Mean :27.6 Mean :19.91
## 3rd Qu.:26.65 3rd Qu.:161.50 3rd Qu.:80.0 3rd Qu.:33.00
## Max. :68.71 Max. :320.00 Max. :95.0 Max. :50.00
## Star_Rating_Current latitude longitude city
## Length:132083 Min. :19.27 Min. :-168.02 Length:132083
## Class :character 1st Qu.:34.11 1st Qu.: -95.43 Class :character
## Mode :character Median :39.94 Median : -85.42 Mode :character
## Mean :38.31 Mean : -89.33
## 3rd Qu.:41.99 3rd Qu.: -78.84
## Max. :48.95 Max. : -67.04
## state county
## Length:132083 Length:132083
## Class :character Class :character
## Mode :character Mode :character
##
##
##
plotdata <- final_data3
# external feature plotting
ggplot(data = plotdata, aes(factor(Star_Rating_Current), experience)) +
guides(fill=FALSE) +
labs(x="Rating Scores", y="Average Experience Years") +
geom_boxplot(aes(fill = factor(Star_Rating_Current)))
ggplot(data = plotdata, aes(factor(Star_Rating_Current), spec_count)) +
guides(fill=FALSE) +
labs(x="Rating Scores", y="Specialty Counts") +
geom_boxplot(aes(fill = factor(Star_Rating_Current)))
ggplot(data = plotdata, aes(factor(Star_Rating_Current), unemployment_rate)) +
guides(fill=FALSE) +
labs(x="Rating Scores", y="Unemployment Rate(%)") +
geom_boxplot(aes(fill = factor(Star_Rating_Current)))
ggplot(data = plotdata, aes(factor(Star_Rating_Current), average_income)) +
guides(fill=FALSE) +
labs(x="Rating Scores", y="Average Income(,000$)") +
geom_boxplot(aes(fill = factor(Star_Rating_Current)))
ggplot(data = plotdata, aes(factor(Star_Rating_Current), average_physician)) +
#theme(legend.position="bottom", legend.title=element_blank()) +
guides(fill=FALSE) +
labs(x="Rating Scores", y="Average Citizens per Physician") +
geom_boxplot(aes(fill = factor(Star_Rating_Current)))
# internal feature plotting
ggplot(data = plotdata, aes(factor(Star_Rating_Current), tier_label)) +
guides(fill=FALSE) +
labs(x="Rating Scores", y="Tiers") +
geom_boxplot(aes(fill = factor(Star_Rating_Current)))
ggplot(data = plotdata, aes(factor(Star_Rating_Current), mnthly_prm)) +
guides(fill=FALSE) +
labs(x="Rating Scores", y="Monthly Premium") +
geom_boxplot(aes(fill = factor(Star_Rating_Current)))
ggplot(data = plotdata, aes(factor(Star_Rating_Current), ann_ddctbl)) +
guides(fill=FALSE) +
labs(x="Rating Scores", y="Annualy Deductible") +
geom_boxplot(aes(fill = factor(Star_Rating_Current)))
ggplot(data = plotdata, aes(factor(Star_Rating_Current), copay_max)) +
guides(fill=FALSE) +
labs(x="Rating Scores", y="Maxium Copayment") +
geom_boxplot(aes(fill = factor(Star_Rating_Current)))
ggplot(data = plotdata, aes(factor(Star_Rating_Current), coin_max)) +
#theme(legend.position="bottom", legend.title=element_blank()) +
guides(fill=FALSE) +
labs(x="Rating Scores", y="Maxium Coinsurance") +
geom_boxplot(aes(fill = factor(Star_Rating_Current)))
ggplot(data = plotdata, aes(factor(Star_Rating_Current))) +
guides(fill=FALSE) +
geom_bar(aes(fill = factor(Star_Rating_Current)))
# here we prepare two datasets for modeling: one keeping the neutral 4-star ratings, and one with them removed
final_data3 <- read.csv(file = './data/final_data_v3.csv')
# keep one copy of the full data and create a second copy with the neutral 4-star ratings removed
final_data3_0rm4 <- final_data3
final_data3_rm4 <- final_data3[final_data3$Star_Rating_Current != 4, ]
# convert the rating into a binary label (1 if above 4 stars, 0 otherwise)
final_data3_0rm4$sentiment <- as.factor(ifelse(final_data3_0rm4$Star_Rating_Current > 4,1,0))
final_data3_rm4$sentiment <- as.factor(ifelse(final_data3_rm4$Star_Rating_Current > 4,1,0))
# subset the needed columns for modeling
colnames <- c('experience', 'spec_count', 'unemployment_rate', 'average_income', 'average_physician', 'tier_label', 'mnthly_prm', 'ann_ddctbl', 'copay_max', 'coin_max', 'sentiment')
final_data3_0rm4 <- final_data3_0rm4[, colnames]
final_data3_rm4 <- final_data3_rm4[, colnames]
str(final_data3_0rm4)
## 'data.frame': 132083 obs. of 11 variables:
## $ experience : num 26.6 26.6 26.6 26.6 26.6 ...
## $ spec_count : int 43 43 43 43 43 43 26 26 26 26 ...
## $ unemployment_rate: num 3.97 3.97 3.97 3.97 3.97 3.97 7.95 7.95 7.95 7.95 ...
## $ average_income : num 28.8 28.8 28.8 28.8 28.8 ...
## $ average_physician: int 76 76 76 76 76 76 466 466 466 466 ...
## $ tier_label : int 5 4 5 5 5 5 5 5 4 5 ...
## $ mnthly_prm : num 28.5 30.6 21.5 26 17.6 ...
## $ ann_ddctbl : num 130 11.5 76.8 168 35.7 ...
## $ copay_max : int 0 0 76 0 0 0 0 0 0 0 ...
## $ coin_max : int 0 0 33 0 25 25 0 25 0 0 ...
## $ sentiment : Factor w/ 2 levels "0","1": 2 1 2 2 2 1 2 2 1 2 ...
str(final_data3_rm4)
## 'data.frame': 75506 obs. of 11 variables:
## $ experience : num 26.6 26.6 26.6 26.6 25.2 ...
## $ spec_count : int 43 43 43 43 26 26 26 26 41 41 ...
## $ unemployment_rate: num 3.97 3.97 3.97 3.97 7.95 7.95 7.95 7.95 5.68 5.68 ...
## $ average_income : num 28.8 28.8 28.8 28.8 20.7 ...
## $ average_physician: int 76 76 76 76 466 466 466 466 13 13 ...
## $ tier_label : int 5 5 5 5 5 5 5 5 5 5 ...
## $ mnthly_prm : num 28.5 21.5 26 17.6 28.5 ...
## $ ann_ddctbl : num 130 76.8 168 35.7 130 ...
## $ copay_max : int 0 76 0 0 0 0 0 76 0 76 ...
## $ coin_max : int 0 33 0 25 0 25 0 33 0 33 ...
## $ sentiment : Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
# split the data into training and test sets (80/20)
set.seed(510)
# for the dataset that keeps the 4-star plans
sample_size_0rm4 <- floor(0.8 * nrow(final_data3_0rm4))
training_index_0rm4 <- sample(seq_len(nrow(final_data3_0rm4)), size = sample_size_0rm4)
train_0rm4 <- final_data3_0rm4[training_index_0rm4,]
test_0rm4 <- final_data3_0rm4[-training_index_0rm4,]
# for the dataset with the 4-star plans removed
set.seed(510)
sample_size_rm4 <- floor(0.8 * nrow(final_data3_rm4))
training_index_rm4 <- sample(seq_len(nrow(final_data3_rm4)), size = sample_size_rm4)
train_rm4 <- final_data3_rm4[training_index_rm4,]
test_rm4 <- final_data3_rm4[-training_index_rm4,]
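# note: sample() above gives a simple random split; a stratified alternative (a sketch only,
# not used in this report) could preserve the class balance with caret:
# idx <- createDataPartition(final_data3_rm4$sentiment, p = 0.8, list = FALSE)
# train_strat <- final_data3_rm4[idx, ]
# test_strat  <- final_data3_rm4[-idx, ]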
# train the logistic regression model on the dataset that keeps the 4-star plans
logistic_regression_model <- glm(formula = sentiment ~ ., family = binomial(link = 'logit'), data = train_0rm4)
summary(logistic_regression_model)
##
## Call:
## glm(formula = sentiment ~ ., family = binomial(link = "logit"),
## data = train_0rm4)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.5654 -0.7071 -0.4463 -0.1581 4.1043
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.504e+00 1.048e-01 -23.895 <2e-16 ***
## experience 8.168e-05 2.106e-03 0.039 0.969
## spec_count -1.031e-02 4.869e-04 -21.173 <2e-16 ***
## unemployment_rate -3.032e-02 2.222e-03 -13.647 <2e-16 ***
## average_income -3.063e-05 7.104e-05 -0.431 0.666
## average_physician -3.390e-04 2.562e-05 -13.233 <2e-16 ***
## tier_label 4.806e-01 1.790e-02 26.851 <2e-16 ***
## mnthly_prm 1.089e-02 6.863e-04 15.874 <2e-16 ***
## ann_ddctbl -1.250e-02 1.249e-04 -100.064 <2e-16 ***
## copay_max -1.395e-02 3.230e-04 -43.185 <2e-16 ***
## coin_max 1.806e-02 6.970e-04 25.914 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 108827 on 105665 degrees of freedom
## Residual deviance: 91270 on 105655 degrees of freedom
## AIC: 91292
##
## Number of Fisher Scoring iterations: 5
# evaluate the model
library(caret)
lr_pred <- predict(logistic_regression_model, newdata = test_0rm4, type = "response")
lr_pred <- ifelse(lr_pred >= 0.5, 1, 0)
# evaluation <- cbind(test, pred)
# evaluation$correct <- ifelse(evaluation$sentiment == evaluation$pred,1,0)
# sum(evaluation$correct) / nrow(evaluation)
confusionMatrix(data = lr_pred, test_0rm4[, 'sentiment'])
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 20450 4723
## 1 446 798
##
## Accuracy : 0.8043
## 95% CI : (0.7995, 0.8091)
## No Information Rate : 0.791
## P-Value [Acc > NIR] : 4.154e-08
##
## Kappa : 0.1723
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.9787
## Specificity : 0.1445
## Pos Pred Value : 0.8124
## Neg Pred Value : 0.6415
## Prevalence : 0.7910
## Detection Rate : 0.7741
## Detection Prevalence : 0.9529
## Balanced Accuracy : 0.5616
##
## 'Positive' Class : 0
##
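# the accuracy above is only modestly better than the no-information rate of 0.791, and the
# specificity is low, which reflects the class imbalance in this dataset; a quick check of
# the training label distribution (a sketch, not run here):
# prop.table(table(train_0rm4$sentiment))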
# train the logistic regression model on the dataset with the 4-star plans removed
logistic_regression_model <- glm(formula = sentiment ~ ., family = binomial(link = 'logit'), data = train_rm4)
summary(logistic_regression_model)
##
## Call:
## glm(formula = sentiment ~ ., family = binomial(link = "logit"),
## data = train_rm4)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.1455 -0.8498 -0.3612 0.9150 4.3925
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.016e+00 1.168e-01 -25.812 <2e-16 ***
## experience 2.207e-03 2.547e-03 0.867 0.3862
## spec_count -1.803e-02 5.695e-04 -31.665 <2e-16 ***
## unemployment_rate -3.620e-02 2.568e-03 -14.096 <2e-16 ***
## average_income -1.650e-04 8.055e-05 -2.049 0.0405 *
## average_physician -4.264e-04 3.048e-05 -13.988 <2e-16 ***
## tier_label 7.302e-01 2.043e-02 35.734 <2e-16 ***
## mnthly_prm 2.989e-02 8.587e-04 34.808 <2e-16 ***
## ann_ddctbl -1.118e-02 1.334e-04 -83.809 <2e-16 ***
## copay_max -1.866e-02 3.794e-04 -49.179 <2e-16 ***
## coin_max 1.792e-02 7.776e-04 23.051 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 79404 on 60403 degrees of freedom
## Residual deviance: 61649 on 60393 degrees of freedom
## AIC: 61671
##
## Number of Fisher Scoring iterations: 5
# evaluate the model
library(caret)
lr_pred <- predict(logistic_regression_model, newdata = test_rm4, type = "response")
lr_pred <- ifelse(lr_pred >= 0.5, 1, 0)
# evaluation <- cbind(test, pred)
# evaluation$correct <- ifelse(evaluation$sentiment == evaluation$pred,1,0)
# sum(evaluation$correct) / nrow(evaluation)
confusionMatrix(data = lr_pred, test_rm4[, 'sentiment'])
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 7932 2117
## 1 1540 3513
##
## Accuracy : 0.7578
## 95% CI : (0.7509, 0.7647)
## No Information Rate : 0.6272
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.4712
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.8374
## Specificity : 0.6240
## Pos Pred Value : 0.7893
## Neg Pred Value : 0.6952
## Prevalence : 0.6272
## Detection Rate : 0.5252
## Detection Prevalence : 0.6654
## Balanced Accuracy : 0.7307
##
## 'Positive' Class : 0
##
library (nnet)
# train the neural network on the dataset that keeps the 4-star plans
neural_network_model <- nnet(formula = sentiment ~ ., data =train_0rm4, decay = 0.5, size = 6)
## # weights: 73
## initial value 53644.729157
## iter 10 value 47637.208736
## iter 20 value 46524.108951
## iter 30 value 44920.621501
## iter 40 value 43875.582198
## iter 50 value 43175.307769
## iter 60 value 42339.380476
## iter 70 value 41774.526227
## iter 80 value 41074.695671
## iter 90 value 40310.029867
## iter 100 value 39763.241093
## final value 39763.241093
## stopped after 100 iterations
# summary(neural_network_model)
# evaluate the model
nn_pred <- predict(neural_network_model, newdata = test_0rm4)
nn_pred <- ifelse(nn_pred >= 0.5, 1, 0)
# confusion matrix
confusionMatrix(data = nn_pred, test_0rm4[, 'sentiment'])
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 20383 3719
## 1 513 1802
##
## Accuracy : 0.8398
## 95% CI : (0.8353, 0.8442)
## No Information Rate : 0.791
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.3838
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.9754
## Specificity : 0.3264
## Pos Pred Value : 0.8457
## Neg Pred Value : 0.7784
## Prevalence : 0.7910
## Detection Rate : 0.7716
## Detection Prevalence : 0.9124
## Balanced Accuracy : 0.6509
##
## 'Positive' Class : 0
##
# train the neural network on the dataset with the 4-star plans removed
neural_network_model <- nnet(formula = sentiment ~ ., data =train_rm4, decay = 0.5, size = 6)
## # weights: 73
## initial value 42403.397320
## iter 10 value 34878.621142
## iter 20 value 33571.315969
## iter 30 value 32112.158414
## iter 40 value 30900.358568
## iter 50 value 29992.900287
## iter 60 value 29429.434779
## iter 70 value 29219.767765
## iter 80 value 28883.006736
## iter 90 value 28732.590234
## iter 100 value 28649.498357
## final value 28649.498357
## stopped after 100 iterations
# summary(neural_network_model)
# evaluate the model
nn_pred <- predict(neural_network_model, newdata = test_rm4)
nn_pred <- ifelse(nn_pred >= 0.5, 1, 0)
# confusion matrix
confusionMatrix(data = nn_pred, test_rm4[, 'sentiment'])
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 8470 2375
## 1 1002 3255
##
## Accuracy : 0.7764
## 95% CI : (0.7697, 0.783)
## No Information Rate : 0.6272
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.4969
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.8942
## Specificity : 0.5782
## Pos Pred Value : 0.7810
## Neg Pred Value : 0.7646
## Prevalence : 0.6272
## Detection Rate : 0.5609
## Detection Prevalence : 0.7181
## Balanced Accuracy : 0.7362
##
## 'Positive' Class : 0
##
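The algorithm table at the top also lists Naive Bayes and Decision Tree as candidates. A minimal sketch of how they could be fit and evaluated on the same split used above (assuming the rpart and e1071 packages are available; these two models are not run in this report):

library(rpart)
library(e1071)
# decision tree on the dataset with the 4-star plans removed
tree_model <- rpart(sentiment ~ ., data = train_rm4, method = "class")
tree_pred <- predict(tree_model, newdata = test_rm4, type = "class")
confusionMatrix(data = tree_pred, test_rm4[, 'sentiment'])
# naive Bayes on the same split
nb_model <- naiveBayes(sentiment ~ ., data = train_rm4)
nb_pred <- predict(nb_model, newdata = test_rm4)
confusionMatrix(data = nb_pred, test_rm4[, 'sentiment'])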