# Load the malpractice claims data and list its column names.
# stringsAsFactors = FALSE keeps the text columns (Specialty, Insurance,
# Gender) as character vectors on every R version; the string-based recoding
# further down compares against and assigns into those columns as text.
medicalmalpractice <- read.csv(
  "Documents/CHULA/1-2567/RegTS/Assignment1/A1_medicalmalpractice.csv",
  stringsAsFactors = FALSE
)
names(medicalmalpractice)
## [1] "Amount" "Severity" "Age" "Private.Attorney"
## [5] "Marital.Status" "Specialty" "Insurance" "Gender"
# Preview the first six rows of the raw data.
head(medicalmalpractice, n = 6L)
## Amount Severity Age Private.Attorney Marital.Status Specialty
## 1 57041 7 62 1 2 Family Practice
## 2 324976 6 38 1 2 OBGYN
## 3 135383 4 34 1 2 Cardiology
## 4 829742 7 42 1 1 Pediatrics
## 5 197675 3 60 0 2 OBGYN
## 6 75368 9 71 1 2 Internal Medicine
## Insurance Gender
## 1 Private Male
## 2 No Insurance Female
## 3 Unknown Male
## 4 No Insurance Female
## 5 Medicare/Medicaid Female
## 6 Medicare/Medicaid Female
# Build an all-numeric working copy of the data; `medicalmalpractice` itself
# is left untouched.

# Convert a text column to zero-based numeric codes: the i-th entry of
# `labels` maps to i - 1. Values that are not listed in `labels` become NA
# and are reported in a warning.
recode_zero_based <- function(x, labels) {
  codes <- match(as.character(x), labels) - 1
  unmapped <- unique(x[is.na(codes) & !is.na(x)])
  if (length(unmapped) > 0) {
    warning(
      "Values without a code were set to NA: ",
      paste(unmapped, collapse = ", "),
      call. = FALSE
    )
  }
  codes
}

# Label order defines the code (first label = 0). The spellings must match
# the CSV exactly, including "Ophthamology" as it is written here.
specialty_labels <- c(
  "Family Practice",        # 0
  "OBGYN",                  # 1
  "Cardiology",             # 2
  "Pediatrics",             # 3
  "Internal Medicine",      # 4
  "Anesthesiology",         # 5
  "Emergency Medicine",     # 6
  "Ophthamology",           # 7
  "Urological Surgery",     # 8
  "Orthopedic Surgery",     # 9
  "Neurology/Neurosurgery", # 10
  "Occupational Medicine",  # 11
  "Resident",               # 12
  "Thoracic Surgery",       # 13
  "General Surgery",        # 14
  "Radiology",              # 15
  "Pathology",              # 16
  "Physical Medicine",      # 17
  "Plastic Surgeon",        # 18
  "Dermatology"             # 19
)
insurance_labels <- c(
  "Medicare/Medicaid",      # 0
  "No Insurance",           # 1
  "Private",                # 2
  "Unknown",                # 3
  "Workers Compensation"    # 4
)
gender_labels <- c(
  "Male",                   # 0
  "Female"                  # 1
)

medicalmalpractice2 <- medicalmalpractice

# Columns that already hold numbers are coerced to double in one pass.
numeric_columns <- c(
  "Amount", "Severity", "Age", "Private.Attorney", "Marital.Status"
)
medicalmalpractice2[numeric_columns] <- lapply(
  medicalmalpractice2[numeric_columns],
  as.numeric
)

medicalmalpractice2$Specialty <- recode_zero_based(
  medicalmalpractice2$Specialty, specialty_labels
)
medicalmalpractice2$Insurance <- recode_zero_based(
  medicalmalpractice2$Insurance, insurance_labels
)
medicalmalpractice2$Gender <- recode_zero_based(
  medicalmalpractice2$Gender, gender_labels
)
# Distribution of the raw claim amounts (taken from the original data frame).
hist(
  medicalmalpractice$Amount,
  main = "Histogram of Amount"
)
Since the histogram of Amount is strongly right-skewed (it has a long right tail), we transform the data using the log function.
# Store the natural log of Amount as a new column and plot its distribution.
medicalmalpractice2$log.Amount <- log(medicalmalpractice2$Amount)
hist(
  medicalmalpractice2$log.Amount,
  main = "Histogram of log Amount"
)

# Distribution and summary statistics of claimant Age.
hist(medicalmalpractice2$Age)
summary(medicalmalpractice2$Age)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0 28.0 43.0 42.7 58.0 87.0
After transforming Amount with the log function, the distribution is much more symmetric and closer to a normal distribution. We therefore use log.Amount as the dependent variable in this study.
If the relationship appears nonlinear and/or heteroscedastic, apply a transformation that fixes this.
# Scatter plot of log.Amount (y-axis) against Age (x-axis).
plot(
  log.Amount ~ Age,
  data = medicalmalpractice2
)
The scatter plot of log.Amount against Age shows a relationship between log.Amount and the age of the claimant: as the age of the claimant increases, log.Amount tends to decrease.
# Number of claims at each Severity level.
table(medicalmalpractice2[["Severity"]])
##
## 1 2 3 4 5 6 7 8 9
## 665 1340 28251 15709 9615 3375 8873 3627 7755
# log.Amount by Severity level.
boxplot(
  log.Amount ~ Severity,
  data = medicalmalpractice2,
  xlab = "Severity",
  ylab = "log.Amount"
)
# Number of claims with (1) and without (0) a private attorney.
table(medicalmalpractice2[["Private.Attorney"]])
##
## 0 1
## 26861 52349
# log.Amount by Private.Attorney indicator.
boxplot(
  log.Amount ~ Private.Attorney,
  data = medicalmalpractice2,
  xlab = "Private.Attorney",
  ylab = "log.Amount"
)
# Number of claims in each Marital.Status code.
table(medicalmalpractice2[["Marital.Status"]])
##
## 0 1 2 3 4
## 3832 22802 41220 994 10362
# log.Amount by Marital.Status code.
boxplot(
  log.Amount ~ Marital.Status,
  data = medicalmalpractice2,
  xlab = "Marital.Status",
  ylab = "log.Amount"
)
# Number of claims in each Specialty code (0-19, as assigned above).
table(medicalmalpractice2[["Specialty"]])
##
## 0 1 2 3 4 5 6 7 8 9 10 11 12
## 11436 8876 2659 1416 5223 8732 4676 3289 2027 7272 4737 725 1983
## 13 14 15 16 17 18 19
## 664 9412 1979 714 642 1364 1384
# log.Amount by Specialty code.
boxplot(
  log.Amount ~ Specialty,
  data = medicalmalpractice2,
  xlab = "Specialty",
  ylab = "log.Amount"
)
# Number of claims in each Insurance code (0-4, as assigned above).
table(medicalmalpractice2[["Insurance"]])
##
## 0 1 2 3 4
## 10882 8002 34289 24052 1985
# log.Amount by Insurance code.
boxplot(
  log.Amount ~ Insurance,
  data = medicalmalpractice2,
  xlab = "Type.insurance",
  ylab = "log.Amount"
)
# Number of claims by Gender code (0 = Male, 1 = Female, as assigned above).
table(medicalmalpractice2[["Gender"]])
##
## 0 1
## 31440 47770
# log.Amount by Gender code.
boxplot(
  log.Amount ~ Gender,
  data = medicalmalpractice2,
  xlab = "Gender",
  ylab = "log.Amount"
)
library(corrplot)
## Warning: package 'corrplot' was built under R version 4.3.3
# Correlation matrix of every column in the numeric working copy, drawn with
# coefficients below the diagonal and circles above it.
# NOTE(review): Specialty, Insurance and Marital.Status hold category codes,
# so their correlation coefficients may not be meaningful - confirm intended.
correlations <- cor(medicalmalpractice2)
corrplot.mixed(
  correlations,
  lower = "number",
  upper = "circle",
  lower.col = "black"
)
#library(graphics)
# Scatterplot matrix restricted to log.Amount and Age.
pairs(
  ~ log.Amount + Age,
  data = medicalmalpractice2,
  main = "Simple Scatterplot Matrix"
)
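Co-variates carried forward into the multivariable model are: (1) any co-variate for which p < 0.2 in its single-predictor model;
(2) any other co-variate which could be important, according to subject knowledge.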
# Single-predictor regression of log.Amount on Age.
model.age <- lm(
  log.Amount ~ Age,
  data = medicalmalpractice2
)
summary(model.age)
##
## Call:
## lm(formula = log.Amount ~ Age, data = medicalmalpractice2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.2065 -0.6843 0.0801 0.5724 2.4862
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 11.760191 0.009227 1274.58 <2e-16 ***
## Age -0.008615 0.000196 -43.95 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.093 on 79208 degrees of freedom
## Multiple R-squared: 0.02381, Adjusted R-squared: 0.0238
## F-statistic: 1932 on 1 and 79208 DF, p-value: < 2.2e-16
# Single-predictor regression of log.Amount on Severity, treated as a factor.
model.severity <- lm(
  log.Amount ~ factor(Severity),
  data = medicalmalpractice2
)
summary(model.severity)
##
## Call:
## lm(formula = log.Amount ~ factor(Severity), data = medicalmalpractice2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.6271 -0.6303 0.0658 0.7345 2.7113
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 11.3712491 0.0391317 290.589 < 2e-16 ***
## factor(Severity)2 -0.0005892 0.0478667 -0.012 0.99
## factor(Severity)3 -0.4280417 0.0395896 -10.812 < 2e-16 ***
## factor(Severity)4 -0.1842539 0.0399514 -4.612 4.00e-06 ***
## factor(Severity)5 0.2280030 0.0404623 5.635 1.76e-08 ***
## factor(Severity)6 0.7899040 0.0428137 18.450 < 2e-16 ***
## factor(Severity)7 0.8020122 0.0405716 19.768 < 2e-16 ***
## factor(Severity)8 0.6661950 0.0425682 15.650 < 2e-16 ***
## factor(Severity)9 0.2919843 0.0407750 7.161 8.09e-13 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.009 on 79201 degrees of freedom
## Multiple R-squared: 0.1678, Adjusted R-squared: 0.1677
## F-statistic: 1996 on 8 and 79201 DF, p-value: < 2.2e-16
# Type III ANOVA table for the Severity-only model.
model.severity.III <- car::Anova(
  model.severity,
  type = "III"
)
model.severity.III
## Anova Table (Type III tests)
##
## Response: log.Amount
## Sum Sq Df F value Pr(>F)
## (Intercept) 85988 1 84441.9 < 2.2e-16 ***
## factor(Severity) 16258 8 1995.7 < 2.2e-16 ***
## Residuals 80651 79201
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Single-predictor regression of log.Amount on the Private.Attorney factor.
model.private.attorney <- lm(
  log.Amount ~ factor(Private.Attorney),
  data = medicalmalpractice2
)
summary(model.private.attorney)
##
## Call:
## lm(formula = log.Amount ~ factor(Private.Attorney), data = medicalmalpractice2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.4439 -0.6418 -0.0537 0.7145 2.9302
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 10.806557 0.006245 1730.6 <2e-16 ***
## factor(Private.Attorney)1 0.886291 0.007681 115.4 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.023 on 79208 degrees of freedom
## Multiple R-squared: 0.1439, Adjusted R-squared: 0.1439
## F-statistic: 1.331e+04 on 1 and 79208 DF, p-value: < 2.2e-16
# Type III ANOVA table for the Private.Attorney-only model.
model.private.attorney.III <- car::Anova(
  model.private.attorney,
  type = "III"
)
model.private.attorney.III
## Anova Table (Type III tests)
##
## Response: log.Amount
## Sum Sq Df F value Pr(>F)
## (Intercept) 3136873 1 2994827 < 2.2e-16 ***
## factor(Private.Attorney) 13945 1 13313 < 2.2e-16 ***
## Residuals 82965 79208
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Single-predictor regression of log.Amount on the Marital.Status factor.
model.maritial.status <- lm(
  log.Amount ~ factor(Marital.Status),
  data = medicalmalpractice2
)
summary(model.maritial.status)
##
## Call:
## lm(formula = log.Amount ~ factor(Marital.Status), data = medicalmalpractice2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.9308 -0.6734 0.0910 0.5634 2.8742
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 12.34112 0.01727 714.63 <2e-16 ***
## factor(Marital.Status)1 -0.91874 0.01866 -49.23 <2e-16 ***
## factor(Marital.Status)2 -0.92033 0.01805 -50.98 <2e-16 ***
## factor(Marital.Status)3 -0.97289 0.03805 -25.57 <2e-16 ***
## factor(Marital.Status)4 -1.47694 0.02021 -73.07 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.069 on 79205 degrees of freedom
## Multiple R-squared: 0.06598, Adjusted R-squared: 0.06594
## F-statistic: 1399 on 4 and 79205 DF, p-value: < 2.2e-16
# Type III ANOVA table for the Marital.Status-only model.
model.maritial.status.III <- car::Anova(
  model.maritial.status,
  type = "III"
)
model.maritial.status.III
## Anova Table (Type III tests)
##
## Response: log.Amount
## Sum Sq Df F value Pr(>F)
## (Intercept) 583626 1 510702.0 < 2.2e-16 ***
## factor(Marital.Status) 6395 4 1398.9 < 2.2e-16 ***
## Residuals 90515 79205
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Single-predictor regression of log.Amount on the Insurance factor.
model.type.insurance <- lm(
  log.Amount ~ factor(Insurance),
  data = medicalmalpractice2
)
summary(model.type.insurance)
##
## Call:
## lm(formula = log.Amount ~ factor(Insurance), data = medicalmalpractice2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.4267 -0.6495 0.0495 0.7255 2.7174
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 11.05668 0.01003 1101.825 < 2e-16 ***
## factor(Insurance)1 0.22581 0.01542 14.648 < 2e-16 ***
## factor(Insurance)2 0.73392 0.01152 63.721 < 2e-16 ***
## factor(Insurance)3 -0.03708 0.01209 -3.066 0.00217 **
## factor(Insurance)4 0.25387 0.02555 9.937 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.047 on 79205 degrees of freedom
## Multiple R-squared: 0.1044, Adjusted R-squared: 0.1043
## F-statistic: 2308 on 4 and 79205 DF, p-value: < 2.2e-16
# Type III ANOVA table for the Insurance-only model.
model.type.insurance.III <- car::Anova(
  model.type.insurance,
  type = "III"
)
model.type.insurance.III
## Anova Table (Type III tests)
##
## Response: log.Amount
## Sum Sq Df F value Pr(>F)
## (Intercept) 1330326 1 1214018.4 < 2.2e-16 ***
## factor(Insurance) 10116 4 2307.9 < 2.2e-16 ***
## Residuals 86793 79205
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Single-predictor regression of log.Amount on the Gender factor.
model.gender <- lm(
  log.Amount ~ factor(Gender),
  data = medicalmalpractice2
)
summary(model.gender)
##
## Call:
## lm(formula = log.Amount ~ factor(Gender), data = medicalmalpractice2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.1766 -0.6636 0.0527 0.5077 2.5719
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 11.167147 0.006153 1815.05 <2e-16 ***
## factor(Gender)1 0.373334 0.007923 47.12 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.091 on 79208 degrees of freedom
## Multiple R-squared: 0.02727, Adjusted R-squared: 0.02726
## F-statistic: 2221 on 1 and 79208 DF, p-value: < 2.2e-16
# Type III ANOVA table for the Gender-only model.
model.gender.III <- car::Anova(
  model.gender,
  type = "III"
)
model.gender.III
## Anova Table (Type III tests)
##
## Response: log.Amount
## Sum Sq Df F value Pr(>F)
## (Intercept) 3920731 1 3294412.7 < 2.2e-16 ***
## factor(Gender) 2643 1 2220.6 < 2.2e-16 ***
## Residuals 94267 79208
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Single-predictor regression of log.Amount on the Specialty factor.
model.specialty <- lm(
  log.Amount ~ factor(Specialty),
  data = medicalmalpractice2
)
summary(model.specialty)
##
## Call:
## lm(formula = log.Amount ~ factor(Specialty), data = medicalmalpractice2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.6653 -0.6125 0.0330 0.6106 2.6046
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 11.752570 0.009409 1249.026 < 2e-16 ***
## factor(Specialty)1 0.052545 0.014234 3.692 0.000223 ***
## factor(Specialty)2 -0.447982 0.021664 -20.679 < 2e-16 ***
## factor(Specialty)3 0.562448 0.028348 19.841 < 2e-16 ***
## factor(Specialty)4 -0.552005 0.016805 -32.849 < 2e-16 ***
## factor(Specialty)5 -1.221191 0.014300 -85.398 < 2e-16 ***
## factor(Specialty)6 -0.619976 0.017466 -35.496 < 2e-16 ***
## factor(Specialty)7 -0.692064 0.019909 -34.761 < 2e-16 ***
## factor(Specialty)8 0.228336 0.024250 9.416 < 2e-16 ***
## factor(Specialty)9 -0.430302 0.015092 -28.512 < 2e-16 ***
## factor(Specialty)10 0.107685 0.017386 6.194 5.91e-10 ***
## factor(Specialty)11 -0.413578 0.038537 -10.732 < 2e-16 ***
## factor(Specialty)12 -0.492518 0.024477 -20.122 < 2e-16 ***
## factor(Specialty)13 -0.417597 0.040167 -10.397 < 2e-16 ***
## factor(Specialty)14 -0.309552 0.014004 -22.105 < 2e-16 ***
## factor(Specialty)15 -1.434829 0.024498 -58.569 < 2e-16 ***
## factor(Specialty)16 -0.433296 0.038815 -11.163 < 2e-16 ***
## factor(Specialty)17 -0.380954 0.040812 -9.334 < 2e-16 ***
## factor(Specialty)18 -0.415773 0.028824 -14.424 < 2e-16 ***
## factor(Specialty)19 0.503631 0.028638 17.586 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.006 on 79190 degrees of freedom
## Multiple R-squared: 0.1726, Adjusted R-squared: 0.1724
## F-statistic: 869.6 on 19 and 79190 DF, p-value: < 2.2e-16
# Type III ANOVA table for the Specialty-only model.
model.specialty.III <- car::Anova(
  model.specialty,
  type = "III"
)
model.specialty.III
## Anova Table (Type III tests)
##
## Response: log.Amount
## Sum Sq Df F value Pr(>F)
## (Intercept) 1579574 1 1560066.46 < 2.2e-16 ***
## factor(Specialty) 16729 19 869.61 < 2.2e-16 ***
## Residuals 80180 79190
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Full model: log.Amount on Age plus all six categorical predictors.
model1 <- lm(
  log.Amount ~ Age + factor(Severity) + factor(Private.Attorney) +
    factor(Marital.Status) + factor(Specialty) + factor(Insurance) +
    factor(Gender),
  data = medicalmalpractice2
)
summary(model1)
##
## Call:
## lm(formula = log.Amount ~ Age + factor(Severity) + factor(Private.Attorney) +
## factor(Marital.Status) + factor(Specialty) + factor(Insurance) +
## factor(Gender), data = medicalmalpractice2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.9017 -0.5247 0.0813 0.5842 3.0767
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 11.2370540 0.0413480 271.768 < 2e-16 ***
## Age -0.0032889 0.0001739 -18.909 < 2e-16 ***
## factor(Severity)2 -0.0097951 0.0415787 -0.236 0.8138
## factor(Severity)3 0.0779337 0.0347715 2.241 0.0250 *
## factor(Severity)4 0.1460935 0.0349558 4.179 2.93e-05 ***
## factor(Severity)5 0.3074173 0.0352669 8.717 < 2e-16 ***
## factor(Severity)6 0.6573744 0.0372808 17.633 < 2e-16 ***
## factor(Severity)7 0.7738358 0.0356817 21.687 < 2e-16 ***
## factor(Severity)8 0.7024718 0.0372317 18.868 < 2e-16 ***
## factor(Severity)9 0.3680782 0.0355504 10.354 < 2e-16 ***
## factor(Private.Attorney)1 0.5782796 0.0088119 65.625 < 2e-16 ***
## factor(Marital.Status)1 -0.7112218 0.0163008 -43.631 < 2e-16 ***
## factor(Marital.Status)2 -0.6420651 0.0151951 -42.255 < 2e-16 ***
## factor(Marital.Status)3 -0.9084077 0.0318466 -28.525 < 2e-16 ***
## factor(Marital.Status)4 -0.7786859 0.0171209 -45.482 < 2e-16 ***
## factor(Specialty)1 0.1330818 0.0131103 10.151 < 2e-16 ***
## factor(Specialty)2 -0.4046167 0.0190074 -21.287 < 2e-16 ***
## factor(Specialty)3 0.3051253 0.0250067 12.202 < 2e-16 ***
## factor(Specialty)4 -0.1358548 0.0150662 -9.017 < 2e-16 ***
## factor(Specialty)5 -0.3831837 0.0140937 -27.188 < 2e-16 ***
## factor(Specialty)6 -0.1747114 0.0156465 -11.166 < 2e-16 ***
## factor(Specialty)7 -0.3223394 0.0177241 -18.187 < 2e-16 ***
## factor(Specialty)8 0.4965736 0.0218554 22.721 < 2e-16 ***
## factor(Specialty)9 -0.2663941 0.0132348 -20.128 < 2e-16 ***
## factor(Specialty)10 0.0654388 0.0152108 4.302 1.69e-05 ***
## factor(Specialty)11 0.1721087 0.0345701 4.979 6.42e-07 ***
## factor(Specialty)12 -0.2366024 0.0214669 -11.022 < 2e-16 ***
## factor(Specialty)13 -0.4387324 0.0351969 -12.465 < 2e-16 ***
## factor(Specialty)14 -0.1658826 0.0123499 -13.432 < 2e-16 ***
## factor(Specialty)15 -0.5545085 0.0223681 -24.790 < 2e-16 ***
## factor(Specialty)16 0.1487479 0.0348059 4.274 1.93e-05 ***
## factor(Specialty)17 -0.3911948 0.0357714 -10.936 < 2e-16 ***
## factor(Specialty)18 -0.1102876 0.0255993 -4.308 1.65e-05 ***
## factor(Specialty)19 0.6335575 0.0254137 24.930 < 2e-16 ***
## factor(Insurance)1 0.1913199 0.0129859 14.733 < 2e-16 ***
## factor(Insurance)2 0.4270579 0.0098674 43.280 < 2e-16 ***
## factor(Insurance)3 0.0713164 0.0105174 6.781 1.20e-11 ***
## factor(Insurance)4 -0.0438296 0.0215600 -2.033 0.0421 *
## factor(Gender)1 0.2889563 0.0084051 34.379 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.8764 on 79171 degrees of freedom
## Multiple R-squared: 0.3726, Adjusted R-squared: 0.3723
## F-statistic: 1237 on 38 and 79171 DF, p-value: < 2.2e-16
# Type III ANOVA table for the full model.
model1.III <- car::Anova(
  model1,
  type = "III"
)
model1.III
## Anova Table (Type III tests)
##
## Response: log.Amount
## Sum Sq Df F value Pr(>F)
## (Intercept) 56724 1 73857.83 < 2.2e-16 ***
## Age 275 1 357.57 < 2.2e-16 ***
## factor(Severity) 4098 8 667.00 < 2.2e-16 ***
## factor(Private.Attorney) 3308 1 4306.62 < 2.2e-16 ***
## factor(Marital.Status) 1788 4 581.93 < 2.2e-16 ***
## factor(Specialty) 3981 19 272.81 < 2.2e-16 ***
## factor(Insurance) 2456 4 799.42 < 2.2e-16 ***
## factor(Gender) 908 1 1181.88 < 2.2e-16 ***
## Residuals 60804 79171
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
After fitting the first (full) model, we perform a backward selection, eliminating the variable that looks the least promising, one at a time.
remove gender
# Full model with Gender dropped.
model2 <- lm(
  log.Amount ~ Age + factor(Severity) + factor(Private.Attorney) +
    factor(Marital.Status) + factor(Specialty) + factor(Insurance),
  data = medicalmalpractice2
)
summary(model2)
##
## Call:
## lm(formula = log.Amount ~ Age + factor(Severity) + factor(Private.Attorney) +
## factor(Marital.Status) + factor(Specialty) + factor(Insurance),
## data = medicalmalpractice2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.7750 -0.5286 0.0763 0.5960 2.9935
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 11.4866526 0.0410080 280.108 < 2e-16 ***
## Age -0.0038593 0.0001744 -22.126 < 2e-16 ***
## factor(Severity)2 -0.0122239 0.0418876 -0.292 0.77042
## factor(Severity)3 -0.0158017 0.0349220 -0.452 0.65092
## factor(Severity)4 0.0635887 0.0351325 1.810 0.07030 .
## factor(Severity)5 0.2520922 0.0354919 7.103 1.23e-12 ***
## factor(Severity)6 0.6025886 0.0375235 16.059 < 2e-16 ***
## factor(Severity)7 0.6217620 0.0356695 17.431 < 2e-16 ***
## factor(Severity)8 0.6229196 0.0374359 16.640 < 2e-16 ***
## factor(Severity)9 0.2954049 0.0357512 8.263 < 2e-16 ***
## factor(Private.Attorney)1 0.5616775 0.0088640 63.366 < 2e-16 ***
## factor(Marital.Status)1 -0.6110156 0.0161573 -37.817 < 2e-16 ***
## factor(Marital.Status)2 -0.5838802 0.0152128 -38.381 < 2e-16 ***
## factor(Marital.Status)3 -0.7600641 0.0317873 -23.911 < 2e-16 ***
## factor(Marital.Status)4 -0.7558942 0.0172351 -43.858 < 2e-16 ***
## factor(Specialty)1 0.2358909 0.0128595 18.344 < 2e-16 ***
## factor(Specialty)2 -0.3567711 0.0190972 -18.682 < 2e-16 ***
## factor(Specialty)3 0.4077837 0.0250123 16.303 < 2e-16 ***
## factor(Specialty)4 -0.1471467 0.0151745 -9.697 < 2e-16 ***
## factor(Specialty)5 -0.4084195 0.0141791 -28.804 < 2e-16 ***
## factor(Specialty)6 -0.2334260 0.0156686 -14.898 < 2e-16 ***
## factor(Specialty)7 -0.2697730 0.0177892 -15.165 < 2e-16 ***
## factor(Specialty)8 0.5286700 0.0219977 24.033 < 2e-16 ***
## factor(Specialty)9 -0.3051730 0.0132846 -22.972 < 2e-16 ***
## factor(Specialty)10 0.0439634 0.0153109 2.871 0.00409 **
## factor(Specialty)11 0.2775299 0.0346896 8.000 1.26e-15 ***
## factor(Specialty)12 -0.2045458 0.0216060 -9.467 < 2e-16 ***
## factor(Specialty)13 -0.3370353 0.0353329 -9.539 < 2e-16 ***
## factor(Specialty)14 -0.1240135 0.0123810 -10.016 < 2e-16 ***
## factor(Specialty)15 -0.6814787 0.0222250 -30.663 < 2e-16 ***
## factor(Specialty)16 0.2533479 0.0349303 7.253 4.11e-13 ***
## factor(Specialty)17 -0.2906494 0.0359165 -8.092 5.93e-16 ***
## factor(Specialty)18 -0.0094062 0.0256195 -0.367 0.71351
## factor(Specialty)19 0.6315127 0.0256025 24.666 < 2e-16 ***
## factor(Insurance)1 0.2053427 0.0130759 15.704 < 2e-16 ***
## factor(Insurance)2 0.4082104 0.0099254 41.128 < 2e-16 ***
## factor(Insurance)3 -0.0080912 0.0103368 -0.783 0.43377
## factor(Insurance)4 -0.0619223 0.0217137 -2.852 0.00435 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.8829 on 79172 degrees of freedom
## Multiple R-squared: 0.3632, Adjusted R-squared: 0.3629
## F-statistic: 1220 on 37 and 79172 DF, p-value: < 2.2e-16
# Type III ANOVA table for model2.
model2.III <- car::Anova(
  model2,
  type = "III"
)
model2.III
## Anova Table (Type III tests)
##
## Response: log.Amount
## Sum Sq Df F value Pr(>F)
## (Intercept) 61157 1 78460.24 < 2.2e-16 ***
## Age 382 1 489.56 < 2.2e-16 ***
## factor(Severity) 3772 8 604.89 < 2.2e-16 ***
## factor(Private.Attorney) 3130 1 4015.24 < 2.2e-16 ***
## factor(Marital.Status) 1558 4 499.54 < 2.2e-16 ***
## factor(Specialty) 5085 19 343.34 < 2.2e-16 ***
## factor(Insurance) 2860 4 917.18 < 2.2e-16 ***
## Residuals 61712 79172
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
remove Insurance
# Predictors: Age, Severity, Private.Attorney, Marital.Status, Specialty
# (Gender and Insurance dropped).
model3 <- lm(
  log.Amount ~ Age + factor(Severity) + factor(Private.Attorney) +
    factor(Marital.Status) + factor(Specialty),
  data = medicalmalpractice2
)
summary(model3)
##
## Call:
## lm(formula = log.Amount ~ Age + factor(Severity) + factor(Private.Attorney) +
## factor(Marital.Status) + factor(Specialty), data = medicalmalpractice2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.6196 -0.5458 0.0628 0.5959 3.0439
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 11.7192219 0.0411559 284.752 < 2e-16 ***
## Age -0.0043400 0.0001782 -24.353 < 2e-16 ***
## factor(Severity)2 -0.0078127 0.0428443 -0.182 0.85531
## factor(Severity)3 -0.0224104 0.0357029 -0.628 0.53021
## factor(Severity)4 0.0678515 0.0359184 1.889 0.05889 .
## factor(Severity)5 0.2776437 0.0362853 7.652 2.01e-14 ***
## factor(Severity)6 0.6626130 0.0383595 17.274 < 2e-16 ***
## factor(Severity)7 0.6732899 0.0364280 18.483 < 2e-16 ***
## factor(Severity)8 0.6731974 0.0382723 17.590 < 2e-16 ***
## factor(Severity)9 0.3154848 0.0365546 8.631 < 2e-16 ***
## factor(Private.Attorney)1 0.5851312 0.0089836 65.133 < 2e-16 ***
## factor(Marital.Status)1 -0.6470610 0.0164871 -39.247 < 2e-16 ***
## factor(Marital.Status)2 -0.6251202 0.0155384 -40.231 < 2e-16 ***
## factor(Marital.Status)3 -0.8050115 0.0324631 -24.798 < 2e-16 ***
## factor(Marital.Status)4 -0.8236961 0.0175881 -46.833 < 2e-16 ***
## factor(Specialty)1 0.2821569 0.0131084 21.525 < 2e-16 ***
## factor(Specialty)2 -0.3795534 0.0195218 -19.443 < 2e-16 ***
## factor(Specialty)3 0.4699420 0.0255446 18.397 < 2e-16 ***
## factor(Specialty)4 -0.1637169 0.0155165 -10.551 < 2e-16 ***
## factor(Specialty)5 -0.4544886 0.0144799 -31.387 < 2e-16 ***
## factor(Specialty)6 -0.2692053 0.0160134 -16.811 < 2e-16 ***
## factor(Specialty)7 -0.2944650 0.0181746 -16.202 < 2e-16 ***
## factor(Specialty)8 0.5831951 0.0224807 25.942 < 2e-16 ***
## factor(Specialty)9 -0.3386426 0.0135751 -24.946 < 2e-16 ***
## factor(Specialty)10 0.0414355 0.0156604 2.646 0.00815 **
## factor(Specialty)11 0.3279830 0.0354682 9.247 < 2e-16 ***
## factor(Specialty)12 -0.2224760 0.0220978 -10.068 < 2e-16 ***
## factor(Specialty)13 -0.3524514 0.0361226 -9.757 < 2e-16 ***
## factor(Specialty)14 -0.1280740 0.0126603 -10.116 < 2e-16 ***
## factor(Specialty)15 -0.7713810 0.0226737 -34.021 < 2e-16 ***
## factor(Specialty)16 0.2888289 0.0357216 8.086 6.27e-16 ***
## factor(Specialty)17 -0.3104816 0.0367211 -8.455 < 2e-16 ***
## factor(Specialty)18 0.0001662 0.0261904 0.006 0.99494
## factor(Specialty)19 0.6910922 0.0261684 26.409 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.9031 on 79176 degrees of freedom
## Multiple R-squared: 0.3337, Adjusted R-squared: 0.3334
## F-statistic: 1202 on 33 and 79176 DF, p-value: < 2.2e-16
# Type III ANOVA table for model3.
model3.III <- car::Anova(
  model3,
  type = "III"
)
model3.III
## Anova Table (Type III tests)
##
## Response: log.Amount
## Sum Sq Df F value Pr(>F)
## (Intercept) 66128 1 81083.84 < 2.2e-16 ***
## Age 484 1 593.06 < 2.2e-16 ***
## factor(Severity) 4552 8 697.66 < 2.2e-16 ***
## factor(Private.Attorney) 3460 1 4242.35 < 2.2e-16 ***
## factor(Marital.Status) 1838 4 563.55 < 2.2e-16 ***
## factor(Specialty) 6584 19 424.93 < 2.2e-16 ***
## Residuals 64572 79176
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
remove Specialty
# Predictors: Age, Severity, Private.Attorney, Marital.Status
# (Gender, Insurance and Specialty dropped).
model4 <- lm(
  log.Amount ~ Age + factor(Severity) + factor(Private.Attorney) +
    factor(Marital.Status),
  data = medicalmalpractice2
)
summary(model4)
##
## Call:
## lm(formula = log.Amount ~ Age + factor(Severity) + factor(Private.Attorney) +
## factor(Marital.Status), data = medicalmalpractice2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.7971 -0.5872 0.0325 0.6389 2.9534
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 11.7552018 0.0417869 281.313 <2e-16 ***
## Age -0.0059086 0.0001859 -31.791 <2e-16 ***
## factor(Severity)2 -0.0019601 0.0449648 -0.044 0.9652
## factor(Severity)3 -0.0682039 0.0374458 -1.821 0.0685 .
## factor(Severity)4 0.0635747 0.0376780 1.687 0.0915 .
## factor(Severity)5 0.3417958 0.0380594 8.981 <2e-16 ***
## factor(Severity)6 0.7425748 0.0402323 18.457 <2e-16 ***
## factor(Severity)7 0.7508217 0.0381547 19.678 <2e-16 ***
## factor(Severity)8 0.7712936 0.0401200 19.225 <2e-16 ***
## factor(Severity)9 0.3501579 0.0383397 9.133 <2e-16 ***
## factor(Private.Attorney)1 0.5768932 0.0079917 72.186 <2e-16 ***
## factor(Marital.Status)1 -0.7184381 0.0170655 -42.099 <2e-16 ***
## factor(Marital.Status)2 -0.6930925 0.0161698 -42.863 <2e-16 ***
## factor(Marital.Status)3 -0.8844600 0.0339471 -26.054 <2e-16 ***
## factor(Marital.Status)4 -0.9811496 0.0182828 -53.665 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.9479 on 79195 degrees of freedom
## Multiple R-squared: 0.2657, Adjusted R-squared: 0.2656
## F-statistic: 2047 on 14 and 79195 DF, p-value: < 2.2e-16
# Type III ANOVA table for model4.
model4.III <- car::Anova(
  model4,
  type = "III"
)
model4.III
## Anova Table (Type III tests)
##
## Response: log.Amount
## Sum Sq Df F value Pr(>F)
## (Intercept) 71104 1 79137.14 < 2.2e-16 ***
## Age 908 1 1010.64 < 2.2e-16 ***
## factor(Severity) 6520 8 907.12 < 2.2e-16 ***
## factor(Private.Attorney) 4682 1 5210.88 < 2.2e-16 ***
## factor(Marital.Status) 2631 4 732.07 < 2.2e-16 ***
## Residuals 71156 79195
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
remove Marital Status
# Predictors: Age, Severity, Private.Attorney.
model5 <- lm(
  log.Amount ~ Age + factor(Severity) + factor(Private.Attorney),
  data = medicalmalpractice2
)
summary(model5)
##
## Call:
## lm(formula = log.Amount ~ Age + factor(Severity) + factor(Private.Attorney),
## data = medicalmalpractice2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.1546 -0.6050 0.0171 0.6598 2.9456
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 11.0019939 0.0390354 281.847 <2e-16 ***
## Age -0.0059210 0.0001743 -33.966 <2e-16 ***
## factor(Severity)2 0.0052617 0.0457854 0.115 0.9085
## factor(Severity)3 -0.0703808 0.0381231 -1.846 0.0649 .
## factor(Severity)4 0.0832507 0.0383566 2.170 0.0300 *
## factor(Severity)5 0.3579685 0.0387471 9.239 <2e-16 ***
## factor(Severity)6 0.7950473 0.0409533 19.414 <2e-16 ***
## factor(Severity)7 0.8138196 0.0388078 20.971 <2e-16 ***
## factor(Severity)8 0.8693059 0.0407981 21.308 <2e-16 ***
## factor(Severity)9 0.3975908 0.0390227 10.189 <2e-16 ***
## factor(Private.Attorney)1 0.6119364 0.0080088 76.408 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.9652 on 79199 degrees of freedom
## Multiple R-squared: 0.2386, Adjusted R-squared: 0.2385
## F-statistic: 2482 on 10 and 79199 DF, p-value: < 2.2e-16
# Type III ANOVA table for model5.
model5.III <- car::Anova(
  model5,
  type = "III"
)
model5.III
## Anova Table (Type III tests)
##
## Response: log.Amount
## Sum Sq Df F value Pr(>F)
## (Intercept) 74009 1 79437.5 < 2.2e-16 ***
## Age 1075 1 1153.7 < 2.2e-16 ***
## factor(Severity) 7810 8 1047.9 < 2.2e-16 ***
## factor(Private.Attorney) 5439 1 5838.2 < 2.2e-16 ***
## Residuals 73787 79199
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
remove Private Attorney
# Predictors: Age and Severity only.
model6 <- lm(
  log.Amount ~ Age + factor(Severity),
  data = medicalmalpractice2
)
summary(model6)
##
## Call:
## lm(formula = log.Amount ~ Age + factor(Severity), data = medicalmalpractice2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.5803 -0.6229 0.0567 0.7194 2.7514
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 11.6500486 0.0394820 295.072 < 2e-16 ***
## Age -0.0068022 0.0001802 -37.741 < 2e-16 ***
## factor(Severity)2 0.0061325 0.0474427 0.129 0.897
## factor(Severity)3 -0.4033570 0.0392441 -10.278 < 2e-16 ***
## factor(Severity)4 -0.1669751 0.0395998 -4.217 2.48e-05 ***
## factor(Severity)5 0.2176989 0.0401045 5.428 5.71e-08 ***
## factor(Severity)6 0.7791488 0.0424351 18.361 < 2e-16 ***
## factor(Severity)7 0.7994931 0.0402120 19.882 < 2e-16 ***
## factor(Severity)8 0.6733387 0.0421912 15.959 < 2e-16 ***
## factor(Severity)9 0.3020775 0.0404144 7.475 7.83e-14 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1 on 79200 degrees of freedom
## Multiple R-squared: 0.1825, Adjusted R-squared: 0.1824
## F-statistic: 1964 on 9 and 79200 DF, p-value: < 2.2e-16
# Type III ANOVA table for model6 (Age + Severity).
# Fix: this previously passed model5, so the table reported a
# Private.Attorney term that model6 does not contain.
# NOTE: the printed table that follows in this document was generated from
# model5 and must be regenerated by re-running this chunk.
model6.III <- car::Anova(model6, type = 3)
model6.III
## Anova Table (Type III tests)
##
## Response: log.Amount
## Sum Sq Df F value Pr(>F)
## (Intercept) 74009 1 79437.5 < 2.2e-16 ***
## Age 1075 1 1153.7 < 2.2e-16 ***
## factor(Severity) 7810 8 1047.9 < 2.2e-16 ***
## factor(Private.Attorney) 5439 1 5838.2 < 2.2e-16 ***
## Residuals 73787 79199
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
remove Severity
The resulting model is identical to model.age fitted earlier.
# Age as the only predictor (same formula as model.age).
model7 <- lm(
  log.Amount ~ Age,
  data = medicalmalpractice2
)
summary(model7)
##
## Call:
## lm(formula = log.Amount ~ Age, data = medicalmalpractice2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.2065 -0.6843 0.0801 0.5724 2.4862
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 11.760191 0.009227 1274.58 <2e-16 ***
## Age -0.008615 0.000196 -43.95 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.093 on 79208 degrees of freedom
## Multiple R-squared: 0.02381, Adjusted R-squared: 0.0238
## F-statistic: 1932 on 1 and 79208 DF, p-value: < 2.2e-16
# Type III ANOVA table for model7, stored and then printed.
model7.III <- car::Anova(model7, type = "III")
model7.III
## Anova Table (Type III tests)
##
## Response: log.Amount
## Sum Sq Df F value Pr(>F)
## (Intercept) 1940279 1 1624552 < 2.2e-16 ***
## Age 2307 1 1932 < 2.2e-16 ***
## Residuals 94602 79208
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
From box plots 2 to 6, it can be seen that removing one variable at a time decreases the adjusted R-squared value accordingly.
Next, we do backward selection, trying to remove only one variable in each model.
remove Private.Attorney
# Fit log.Amount on Age, Severity, Marital.Status, Specialty, Insurance and
# Gender (Private.Attorney left out), then print the coefficient summary.
model8 <- lm(
  log.Amount ~ Age + factor(Severity) + factor(Marital.Status) +
    factor(Specialty) + factor(Insurance) + factor(Gender),
  data = medicalmalpractice2
)
summary(model8)
##
## Call:
## lm(formula = log.Amount ~ Age + factor(Severity) + factor(Marital.Status) +
## factor(Specialty) + factor(Insurance) + factor(Gender), data = medicalmalpractice2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.3630 -0.5307 0.0850 0.6044 3.0868
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 11.9537187 0.0409498 291.912 < 2e-16 ***
## Age -0.0039958 0.0001783 -22.417 < 2e-16 ***
## factor(Severity)2 -0.0126418 0.0426943 -0.296 0.76715
## factor(Severity)3 -0.1422028 0.0355380 -4.001 6.30e-05 ***
## factor(Severity)4 -0.0260597 0.0357926 -0.728 0.46657
## factor(Severity)5 0.2042260 0.0361771 5.645 1.66e-08 ***
## factor(Severity)6 0.6421999 0.0382804 16.776 < 2e-16 ***
## factor(Severity)7 0.7408774 0.0366354 20.223 < 2e-16 ***
## factor(Severity)8 0.5660483 0.0381711 14.829 < 2e-16 ***
## factor(Severity)9 0.2925145 0.0364851 8.017 1.09e-15 ***
## factor(Marital.Status)1 -0.7386569 0.0167327 -44.145 < 2e-16 ***
## factor(Marital.Status)2 -0.6426351 0.0156028 -41.187 < 2e-16 ***
## factor(Marital.Status)3 -0.7847173 0.0326437 -24.039 < 2e-16 ***
## factor(Marital.Status)4 -0.8486884 0.0175461 -48.369 < 2e-16 ***
## factor(Specialty)1 -0.0152752 0.0132604 -1.152 0.24935
## factor(Specialty)2 -0.3459458 0.0194958 -17.745 < 2e-16 ***
## factor(Specialty)3 0.3368405 0.0256729 13.120 < 2e-16 ***
## factor(Specialty)4 -0.3344122 0.0151552 -22.066 < 2e-16 ***
## factor(Specialty)5 -0.7340793 0.0133899 -54.823 < 2e-16 ***
## factor(Specialty)6 -0.3431824 0.0158486 -21.654 < 2e-16 ***
## factor(Specialty)7 -0.4639409 0.0180643 -25.683 < 2e-16 ***
## factor(Specialty)8 0.1694616 0.0218504 7.756 8.90e-15 ***
## factor(Specialty)9 -0.2724202 0.0135896 -20.046 < 2e-16 ***
## factor(Specialty)10 0.0782914 0.0156177 5.013 5.37e-07 ***
## factor(Specialty)11 -0.2798177 0.0347862 -8.044 8.82e-16 ***
## factor(Specialty)12 -0.3564365 0.0219630 -16.229 < 2e-16 ***
## factor(Specialty)13 -0.3766949 0.0361282 -10.427 < 2e-16 ***
## factor(Specialty)14 -0.2309076 0.0126404 -18.267 < 2e-16 ***
## factor(Specialty)15 -0.8086908 0.0226213 -35.749 < 2e-16 ***
## factor(Specialty)16 -0.3065071 0.0350227 -8.752 < 2e-16 ***
## factor(Specialty)17 -0.3343846 0.0367204 -9.106 < 2e-16 ***
## factor(Specialty)18 -0.3175029 0.0260854 -12.172 < 2e-16 ***
## factor(Specialty)19 0.3548371 0.0257286 13.792 < 2e-16 ***
## factor(Insurance)1 0.1427332 0.0133126 10.722 < 2e-16 ***
## factor(Insurance)2 0.4415222 0.0101296 43.587 < 2e-16 ***
## factor(Insurance)3 0.0431195 0.0107906 3.996 6.45e-05 ***
## factor(Insurance)4 0.0666996 0.0220708 3.022 0.00251 **
## factor(Gender)1 0.2587275 0.0086177 30.023 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.8999 on 79172 degrees of freedom
## Multiple R-squared: 0.3384, Adjusted R-squared: 0.3381
## F-statistic: 1095 on 37 and 79172 DF, p-value: < 2.2e-16
# Type III ANOVA table for model8, stored and then printed.
model8.III <- car::Anova(model8, type = "III")
model8.III
## Anova Table (Type III tests)
##
## Response: log.Amount
## Sum Sq Df F value Pr(>F)
## (Intercept) 69003 1 85212.38 < 2.2e-16 ***
## Age 407 1 502.50 < 2.2e-16 ***
## factor(Severity) 6687 8 1032.22 < 2.2e-16 ***
## factor(Marital.Status) 2024 4 624.90 < 2.2e-16 ***
## factor(Specialty) 5104 19 331.76 < 2.2e-16 ***
## factor(Insurance) 2798 4 863.95 < 2.2e-16 ***
## factor(Gender) 730 1 901.37 < 2.2e-16 ***
## Residuals 64112 79172
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
remove Severity
# Fit log.Amount on Age, Private.Attorney, Marital.Status, Specialty,
# Insurance and Gender (Severity left out), then print the coefficient summary.
model9 <- lm(
  log.Amount ~ Age + factor(Private.Attorney) + factor(Marital.Status) +
    factor(Specialty) + factor(Insurance) + factor(Gender),
  data = medicalmalpractice2
)
summary(model9)
##
## Call:
## lm(formula = log.Amount ~ Age + factor(Private.Attorney) + factor(Marital.Status) +
## factor(Specialty) + factor(Insurance) + factor(Gender), data = medicalmalpractice2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.5790 -0.5496 0.0589 0.5816 3.3033
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 11.5540153 0.0229902 502.563 < 2e-16 ***
## Age -0.0038640 0.0001794 -21.536 < 2e-16 ***
## factor(Private.Attorney)1 0.7308660 0.0086173 84.814 < 2e-16 ***
## factor(Marital.Status)1 -0.8102968 0.0167337 -48.423 < 2e-16 ***
## factor(Marital.Status)2 -0.7340057 0.0156050 -47.036 < 2e-16 ***
## factor(Marital.Status)3 -1.0441128 0.0328381 -31.796 < 2e-16 ***
## factor(Marital.Status)4 -0.8945727 0.0175808 -50.884 < 2e-16 ***
## factor(Specialty)1 0.1617523 0.0135309 11.954 < 2e-16 ***
## factor(Specialty)2 -0.4697343 0.0196083 -23.956 < 2e-16 ***
## factor(Specialty)3 0.3594479 0.0257914 13.937 < 2e-16 ***
## factor(Specialty)4 -0.1645730 0.0155513 -10.583 < 2e-16 ***
## factor(Specialty)5 -0.4467280 0.0145201 -30.766 < 2e-16 ***
## factor(Specialty)6 -0.2075833 0.0161500 -12.853 < 2e-16 ***
## factor(Specialty)7 -0.3825499 0.0182813 -20.926 < 2e-16 ***
## factor(Specialty)8 0.5553710 0.0225416 24.638 < 2e-16 ***
## factor(Specialty)9 -0.3058662 0.0136583 -22.394 < 2e-16 ***
## factor(Specialty)10 0.0645746 0.0156888 4.116 3.86e-05 ***
## factor(Specialty)11 0.1749780 0.0357094 4.900 9.60e-07 ***
## factor(Specialty)12 -0.2707255 0.0221694 -12.212 < 2e-16 ***
## factor(Specialty)13 -0.4995212 0.0363441 -13.744 < 2e-16 ***
## factor(Specialty)14 -0.1901972 0.0127528 -14.914 < 2e-16 ***
## factor(Specialty)15 -0.6515576 0.0230586 -28.257 < 2e-16 ***
## factor(Specialty)16 0.1722823 0.0359492 4.792 1.65e-06 ***
## factor(Specialty)17 -0.4643031 0.0369247 -12.574 < 2e-16 ***
## factor(Specialty)18 -0.1365591 0.0264394 -5.165 2.41e-07 ***
## factor(Specialty)19 0.7275559 0.0261996 27.770 < 2e-16 ***
## factor(Insurance)1 0.2261872 0.0134018 16.877 < 2e-16 ***
## factor(Insurance)2 0.4936135 0.0101435 48.663 < 2e-16 ***
## factor(Insurance)3 0.0826797 0.0108568 7.615 2.66e-14 ***
## factor(Insurance)4 -0.0438233 0.0222716 -1.968 0.0491 *
## factor(Gender)1 0.2232372 0.0083814 26.635 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.9054 on 79179 degrees of freedom
## Multiple R-squared: 0.3303, Adjusted R-squared: 0.33
## F-statistic: 1302 on 30 and 79179 DF, p-value: < 2.2e-16
# Type III ANOVA table for model9, stored and then printed.
model9.III <- car::Anova(model9, type = "III")
model9.III
## Anova Table (Type III tests)
##
## Response: log.Amount
## Sum Sq Df F value Pr(>F)
## (Intercept) 207029 1 252569.71 < 2.2e-16 ***
## Age 380 1 463.81 < 2.2e-16 ***
## factor(Private.Attorney) 5896 1 7193.41 < 2.2e-16 ***
## factor(Marital.Status) 2379 4 725.57 < 2.2e-16 ***
## factor(Specialty) 5458 19 350.43 < 2.2e-16 ***
## factor(Insurance) 3316 4 1011.31 < 2.2e-16 ***
## factor(Gender) 582 1 709.42 < 2.2e-16 ***
## Residuals 64902 79179
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
remove Age
# Fit log.Amount on Severity, Private.Attorney, Marital.Status, Specialty,
# Insurance and Gender (Age left out), then print the coefficient summary.
model10 <- lm(
  log.Amount ~ factor(Severity) + factor(Private.Attorney) +
    factor(Marital.Status) + factor(Specialty) + factor(Insurance) +
    factor(Gender),
  data = medicalmalpractice2
)
summary(model10)
##
## Call:
## lm(formula = log.Amount ~ factor(Severity) + factor(Private.Attorney) +
## factor(Marital.Status) + factor(Specialty) + factor(Insurance) +
## factor(Gender), data = medicalmalpractice2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.9321 -0.5269 0.0806 0.5853 3.1125
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 11.062129 0.040391 273.879 < 2e-16 ***
## factor(Severity)2 -0.014854 0.041671 -0.356 0.721
## factor(Severity)3 0.081090 0.034849 2.327 0.020 *
## factor(Severity)4 0.150509 0.035034 4.296 1.74e-05 ***
## factor(Severity)5 0.314983 0.035344 8.912 < 2e-16 ***
## factor(Severity)6 0.663100 0.037363 17.747 < 2e-16 ***
## factor(Severity)7 0.787120 0.035755 22.014 < 2e-16 ***
## factor(Severity)8 0.709937 0.037313 19.026 < 2e-16 ***
## factor(Severity)9 0.372402 0.035630 10.452 < 2e-16 ***
## factor(Private.Attorney)1 0.588599 0.008815 66.774 < 2e-16 ***
## factor(Marital.Status)1 -0.662038 0.016128 -41.049 < 2e-16 ***
## factor(Marital.Status)2 -0.643811 0.015229 -42.275 < 2e-16 ***
## factor(Marital.Status)3 -0.923352 0.031908 -28.938 < 2e-16 ***
## factor(Marital.Status)4 -0.780304 0.017159 -45.474 < 2e-16 ***
## factor(Specialty)1 0.133842 0.013140 10.186 < 2e-16 ***
## factor(Specialty)2 -0.413883 0.019044 -21.733 < 2e-16 ***
## factor(Specialty)3 0.306106 0.025063 12.213 < 2e-16 ***
## factor(Specialty)4 -0.139792 0.015099 -9.259 < 2e-16 ***
## factor(Specialty)5 -0.395564 0.014110 -28.034 < 2e-16 ***
## factor(Specialty)6 -0.181164 0.015678 -11.555 < 2e-16 ***
## factor(Specialty)7 -0.331067 0.017758 -18.643 < 2e-16 ***
## factor(Specialty)8 0.498243 0.021904 22.746 < 2e-16 ***
## factor(Specialty)9 -0.271327 0.013262 -20.459 < 2e-16 ***
## factor(Specialty)10 0.062654 0.015244 4.110 3.96e-05 ***
## factor(Specialty)11 0.175370 0.034647 5.062 4.17e-07 ***
## factor(Specialty)12 -0.236328 0.021515 -10.984 < 2e-16 ***
## factor(Specialty)13 -0.449145 0.035272 -12.734 < 2e-16 ***
## factor(Specialty)14 -0.167283 0.012377 -13.515 < 2e-16 ***
## factor(Specialty)15 -0.565370 0.022411 -25.227 < 2e-16 ***
## factor(Specialty)16 0.150438 0.034884 4.313 1.62e-05 ***
## factor(Specialty)17 -0.401374 0.035848 -11.197 < 2e-16 ***
## factor(Specialty)18 -0.112169 0.025657 -4.372 1.23e-05 ***
## factor(Specialty)19 0.635291 0.025471 24.942 < 2e-16 ***
## factor(Insurance)1 0.193104 0.013015 14.837 < 2e-16 ***
## factor(Insurance)2 0.433524 0.009884 43.863 < 2e-16 ***
## factor(Insurance)3 0.074508 0.010540 7.069 1.57e-12 ***
## factor(Insurance)4 -0.039876 0.021607 -1.845 0.065 .
## factor(Gender)1 0.304117 0.008386 36.266 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.8783 on 79172 degrees of freedom
## Multiple R-squared: 0.3697, Adjusted R-squared: 0.3694
## F-statistic: 1255 on 37 and 79172 DF, p-value: < 2.2e-16
# Type III ANOVA table for model10, stored and then printed.
model10.III <- car::Anova(model10, type = "III")
model10.III
## Anova Table (Type III tests)
##
## Response: log.Amount
## Sum Sq Df F value Pr(>F)
## (Intercept) 57868 1 75009.87 < 2.2e-16 ***
## factor(Severity) 4204 8 681.12 < 2.2e-16 ***
## factor(Private.Attorney) 3440 1 4458.80 < 2.2e-16 ***
## factor(Marital.Status) 1743 4 564.72 < 2.2e-16 ***
## factor(Specialty) 4116 19 280.78 < 2.2e-16 ***
## factor(Insurance) 2516 4 815.29 < 2.2e-16 ***
## factor(Gender) 1015 1 1315.25 < 2.2e-16 ***
## Residuals 61079 79172
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
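Remove Insurance (note: the formula fitted below also omits Gender, so this model drops two variables rather than one).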
# Fit log.Amount on Age, Severity, Private.Attorney, Marital.Status and
# Specialty; neither Insurance nor Gender appears in this formula.
# The coefficient summary is printed afterwards.
model11 <- lm(
  log.Amount ~ Age + factor(Severity) + factor(Private.Attorney) +
    factor(Marital.Status) + factor(Specialty),
  data = medicalmalpractice2
)
summary(model11)
##
## Call:
## lm(formula = log.Amount ~ Age + factor(Severity) + factor(Private.Attorney) +
## factor(Marital.Status) + factor(Specialty), data = medicalmalpractice2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.6196 -0.5458 0.0628 0.5959 3.0439
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 11.7192219 0.0411559 284.752 < 2e-16 ***
## Age -0.0043400 0.0001782 -24.353 < 2e-16 ***
## factor(Severity)2 -0.0078127 0.0428443 -0.182 0.85531
## factor(Severity)3 -0.0224104 0.0357029 -0.628 0.53021
## factor(Severity)4 0.0678515 0.0359184 1.889 0.05889 .
## factor(Severity)5 0.2776437 0.0362853 7.652 2.01e-14 ***
## factor(Severity)6 0.6626130 0.0383595 17.274 < 2e-16 ***
## factor(Severity)7 0.6732899 0.0364280 18.483 < 2e-16 ***
## factor(Severity)8 0.6731974 0.0382723 17.590 < 2e-16 ***
## factor(Severity)9 0.3154848 0.0365546 8.631 < 2e-16 ***
## factor(Private.Attorney)1 0.5851312 0.0089836 65.133 < 2e-16 ***
## factor(Marital.Status)1 -0.6470610 0.0164871 -39.247 < 2e-16 ***
## factor(Marital.Status)2 -0.6251202 0.0155384 -40.231 < 2e-16 ***
## factor(Marital.Status)3 -0.8050115 0.0324631 -24.798 < 2e-16 ***
## factor(Marital.Status)4 -0.8236961 0.0175881 -46.833 < 2e-16 ***
## factor(Specialty)1 0.2821569 0.0131084 21.525 < 2e-16 ***
## factor(Specialty)2 -0.3795534 0.0195218 -19.443 < 2e-16 ***
## factor(Specialty)3 0.4699420 0.0255446 18.397 < 2e-16 ***
## factor(Specialty)4 -0.1637169 0.0155165 -10.551 < 2e-16 ***
## factor(Specialty)5 -0.4544886 0.0144799 -31.387 < 2e-16 ***
## factor(Specialty)6 -0.2692053 0.0160134 -16.811 < 2e-16 ***
## factor(Specialty)7 -0.2944650 0.0181746 -16.202 < 2e-16 ***
## factor(Specialty)8 0.5831951 0.0224807 25.942 < 2e-16 ***
## factor(Specialty)9 -0.3386426 0.0135751 -24.946 < 2e-16 ***
## factor(Specialty)10 0.0414355 0.0156604 2.646 0.00815 **
## factor(Specialty)11 0.3279830 0.0354682 9.247 < 2e-16 ***
## factor(Specialty)12 -0.2224760 0.0220978 -10.068 < 2e-16 ***
## factor(Specialty)13 -0.3524514 0.0361226 -9.757 < 2e-16 ***
## factor(Specialty)14 -0.1280740 0.0126603 -10.116 < 2e-16 ***
## factor(Specialty)15 -0.7713810 0.0226737 -34.021 < 2e-16 ***
## factor(Specialty)16 0.2888289 0.0357216 8.086 6.27e-16 ***
## factor(Specialty)17 -0.3104816 0.0367211 -8.455 < 2e-16 ***
## factor(Specialty)18 0.0001662 0.0261904 0.006 0.99494
## factor(Specialty)19 0.6910922 0.0261684 26.409 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.9031 on 79176 degrees of freedom
## Multiple R-squared: 0.3337, Adjusted R-squared: 0.3334
## F-statistic: 1202 on 33 and 79176 DF, p-value: < 2.2e-16
# Type III ANOVA table for model11, stored and then printed.
model11.III <- car::Anova(model11, type = "III")
model11.III
## Anova Table (Type III tests)
##
## Response: log.Amount
## Sum Sq Df F value Pr(>F)
## (Intercept) 66128 1 81083.84 < 2.2e-16 ***
## Age 484 1 593.06 < 2.2e-16 ***
## factor(Severity) 4552 8 697.66 < 2.2e-16 ***
## factor(Private.Attorney) 3460 1 4242.35 < 2.2e-16 ***
## factor(Marital.Status) 1838 4 563.55 < 2.2e-16 ***
## factor(Specialty) 6584 19 424.93 < 2.2e-16 ***
## Residuals 64572 79176
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Remove Specialty (note: the formula fitted below also omits Insurance and Gender).
# Fit log.Amount on Age, Severity, Private.Attorney and Marital.Status only;
# Specialty, Insurance and Gender do not appear in this formula.
# The coefficient summary is printed afterwards.
model12 <- lm(
  log.Amount ~ Age + factor(Severity) + factor(Private.Attorney) +
    factor(Marital.Status),
  data = medicalmalpractice2
)
summary(model12)
##
## Call:
## lm(formula = log.Amount ~ Age + factor(Severity) + factor(Private.Attorney) +
## factor(Marital.Status), data = medicalmalpractice2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.7971 -0.5872 0.0325 0.6389 2.9534
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 11.7552018 0.0417869 281.313 <2e-16 ***
## Age -0.0059086 0.0001859 -31.791 <2e-16 ***
## factor(Severity)2 -0.0019601 0.0449648 -0.044 0.9652
## factor(Severity)3 -0.0682039 0.0374458 -1.821 0.0685 .
## factor(Severity)4 0.0635747 0.0376780 1.687 0.0915 .
## factor(Severity)5 0.3417958 0.0380594 8.981 <2e-16 ***
## factor(Severity)6 0.7425748 0.0402323 18.457 <2e-16 ***
## factor(Severity)7 0.7508217 0.0381547 19.678 <2e-16 ***
## factor(Severity)8 0.7712936 0.0401200 19.225 <2e-16 ***
## factor(Severity)9 0.3501579 0.0383397 9.133 <2e-16 ***
## factor(Private.Attorney)1 0.5768932 0.0079917 72.186 <2e-16 ***
## factor(Marital.Status)1 -0.7184381 0.0170655 -42.099 <2e-16 ***
## factor(Marital.Status)2 -0.6930925 0.0161698 -42.863 <2e-16 ***
## factor(Marital.Status)3 -0.8844600 0.0339471 -26.054 <2e-16 ***
## factor(Marital.Status)4 -0.9811496 0.0182828 -53.665 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.9479 on 79195 degrees of freedom
## Multiple R-squared: 0.2657, Adjusted R-squared: 0.2656
## F-statistic: 2047 on 14 and 79195 DF, p-value: < 2.2e-16
# Type III ANOVA table for model12, stored and then printed.
model12.III <- car::Anova(model12, type = "III")
model12.III
## Anova Table (Type III tests)
##
## Response: log.Amount
## Sum Sq Df F value Pr(>F)
## (Intercept) 71104 1 79137.14 < 2.2e-16 ***
## Age 908 1 1010.64 < 2.2e-16 ***
## factor(Severity) 6520 8 907.12 < 2.2e-16 ***
## factor(Private.Attorney) 4682 1 5210.88 < 2.2e-16 ***
## factor(Marital.Status) 2631 4 732.07 < 2.2e-16 ***
## Residuals 71156 79195
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Diagnostic plots produced by lm
# Diagnostic plots for model1, drawn with plot() on the fitted model.
# All six panels on one page: 3 rows x 2 columns.
par(mfrow = c(3, 2))
plot(model1, which = seq_len(6))
# Two panels side by side (1 row x 2 columns): panels 1-2, then panels 4-5.
par(mfrow = c(1, 2))
plot(model1, which = c(1L, 2L))
plot(model1, which = c(4L, 5L))
# One panel per page (1 row x 1 column) for panel 1 and panel 3.
par(mfrow = c(1, 1))
plot(model1, which = 1L)
plot(model1, which = 3L)
Goldfeld-Quandt test
library(lmtest)
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
# Goldfeld-Quandt test on model1 with lmtest's default settings; the reported
# alternative is that the variance increases from segment 1 to segment 2.
lmtest::gqtest(model1)
##
## Goldfeld-Quandt test
##
## data: model1
## GQ = 1.0039, df1 = 39566, df2 = 39566, p-value = 0.3495
## alternative hypothesis: variance increases from segment 1 to 2
Durbin-Watson test
# Durbin-Watson test on model1 with lmtest's default settings; the reported
# alternative is that the true autocorrelation is greater than 0.
lmtest::dwtest(model1)
##
## Durbin-Watson test
##
## data: model1
## DW = 1.9966, p-value = 0.3139
## alternative hypothesis: true autocorrelation is greater than 0
Anderson-Darling test for normality
library(nortest)
# Draw diagnostic panel 2 for model1, then run the Anderson-Darling normality
# test on the residuals stored in the fitted model object.
plot(model1, which = 2L)
model1.residuals <- model1[["residuals"]]
nortest::ad.test(model1.residuals)
##
## Anderson-Darling normality test
##
## data: model1.residuals
## A = 241.89, p-value < 2.2e-16
VIF
# (Generalised) variance inflation factors for the terms of model2; the result
# is kept in y and printed.
y <- car::vif(model2)
y
## GVIF Df GVIF^(1/(2*Df))
## Age 1.213383 1 1.101537
## factor(Severity) 1.368522 8 1.019802
## factor(Private.Attorney) 1.789432 1 1.337697
## factor(Marital.Status) 1.358924 4 1.039081
## factor(Specialty) 1.827278 19 1.015990
## factor(Insurance) 1.166865 4 1.019477
# Square only the adjusted GVIF column (column 3 of y). Squaring
# GVIF^(1/(2*Df)) gives GVIF^(1/Df), so the column is relabelled to match.
# The previous `y^2` squared every column of the matrix, including Df
# (8 -> 64, 19 -> 361) and the raw GVIF, while the printed headers stayed the
# same. GVIF and Df are left as reported by car::vif().
v <- y
v[, 3] <- y[, 3]^2
colnames(v)[3] <- "GVIF^(1/Df)"
v
## GVIF Df GVIF^(1/Df)
## Age 1.213383 1 1.213383
## factor(Severity) 1.368522 8 1.039996
## factor(Private.Attorney) 1.789432 1 1.789432
## factor(Marital.Status) 1.358924 4 1.079689
## factor(Specialty) 1.827278 19 1.032236
## factor(Insurance) 1.166865 4 1.039334