# Import the hospital-costs data set.
# NOTE(review): the path below is absolute and machine-specific; adjust it
# before running this script on another machine.
hosp <- read.csv(
  "C:/Users/Vaibhav Goyal/Desktop/simpl/DATA SCIENCE WITH R/Healthcare/HospitalCosts.csv",
  header = TRUE  # spelled out: T is an ordinary variable and can be reassigned
)
# Print the first rows to check that the columns were parsed as expected.
head(hosp)
## AGE FEMALE LOS RACE TOTCHG APRDRG
## 1 17 1 2 1 2660 560
## 2 17 0 2 1 1689 753
## 3 17 1 7 1 20060 930
## 4 17 1 1 1 736 758
## 5 17 1 1 1 1194 754
## 6 17 0 0 1 3305 347
# Print per-column summary statistics (min, quartiles, mean, max, NA count)
# for the whole data frame.
summary(hosp)
## AGE FEMALE LOS RACE
## Min. : 0.000 Min. :0.000 Min. : 0.000 Min. :1.000
## 1st Qu.: 0.000 1st Qu.:0.000 1st Qu.: 2.000 1st Qu.:1.000
## Median : 0.000 Median :1.000 Median : 2.000 Median :1.000
## Mean : 5.086 Mean :0.512 Mean : 2.828 Mean :1.078
## 3rd Qu.:13.000 3rd Qu.:1.000 3rd Qu.: 3.000 3rd Qu.:1.000
## Max. :17.000 Max. :1.000 Max. :41.000 Max. :6.000
## NA's :1
## TOTCHG APRDRG
## Min. : 532 Min. : 21.0
## 1st Qu.: 1216 1st Qu.:640.0
## Median : 1536 Median :640.0
## Mean : 2774 Mean :616.4
## 3rd Qu.: 2530 3rd Qu.:751.0
## Max. :48388 Max. :952.0
##
# Put the columns of hosp on the search path; later statements in this script
# refer to them by bare name (e.g. hist(AGE)).
# NOTE(review): attach() makes name lookups implicit and is generally
# discouraged; switching to data = hosp / hosp$col would require updating
# every statement that relies on it.
attach(hosp)
# Insight 1: which ages account for the most admissions and the most cost?
hist(x = AGE)

# Recode AGE as a factor so summary() reports the number of admissions per age.
ag <- factor(AGE)
summary(ag)
## 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17
## 307 10 1 3 2 2 2 3 2 2 4 8 15 18 25 29 29 38
# Infants (age 0) are by far the most frequent patients: 307 admissions.
# Total charges summed within each age.
tapply(X = TOTCHG, INDEX = AGE, FUN = sum)
## 0 1 2 3 4 5 6 7 8 9
## 678118 37744 7298 30550 15992 18507 17928 10087 4741 21147
## 10 11 12 13 14 15 16 17
## 24469 14250 54912 31135 64643 111747 69149 174777
# Age with the largest total (printed as the age label and its position).
which.max(tapply(X = TOTCHG, INDEX = AGE, FUN = sum))
## 0
## 1
# Infants also have the highest total expenditure (678118), followed by
# age 17 (174777) and age 15 (111747).
# Insight 2: which diagnosis-related group (APRDRG) has the most admissions
# and the highest total cost?
diagg <- as.factor(APRDRG)
summary(diagg)
## 21 23 49 50 51 53 54 57 58 92 97 114 115 137 138 139 141 143
## 1 1 1 1 1 10 1 2 1 1 1 1 2 1 4 5 1 1
## 204 206 225 249 254 308 313 317 344 347 420 421 422 560 561 566 580 581
## 1 1 2 6 1 1 1 1 2 3 2 1 3 2 1 1 1 3
## 602 614 626 633 634 636 639 640 710 720 723 740 750 751 753 754 755 756
## 1 3 6 4 2 3 4 267 1 1 2 1 1 14 36 37 13 2
## 758 760 776 811 812 863 911 930 952
## 20 2 1 2 3 1 1 2 1
# Find the most frequent group from table() rather than summary(): summary()
# on a factor lumps every level beyond its `maxsum` limit (default 100) into
# "(Other)", which could then be returned by which.max().
which.max(table(diagg))
## 640
## 44
# Total charges per diagnosis group, computed once and reused below.
diag_cost_totals <- tapply(TOTCHG, diagg, sum)
diag_cost_totals
## 21 23 49 50 51 53 54 57 58 92
## 10002 14174 20195 3908 3023 82271 851 14509 2117 12024
## 97 114 115 137 138 139 141 143 204 206
## 9530 10562 25832 15129 13622 17766 2860 1393 8439 9230
## 225 249 254 308 313 317 344 347 420 421
## 25649 16642 615 10585 8159 17524 14802 12597 6357 26356
## 422 560 561 566 580 581 602 614 626 633
## 5177 4877 2296 2129 2825 7453 29188 27531 23289 17591
## 634 636 639 640 710 720 723 740 750 751
## 9952 23224 12612 437978 8223 14243 5289 11125 1753 21666
## 753 754 755 756 758 760 776 811 812 863
## 79542 59150 11168 1494 34953 8273 1193 3838 9524 13040
## 911 930 952
## 48388 26654 4833
which.max(diag_cost_totals)
## 640
## 44
max(diag_cost_totals)
## [1] 437978
# Group 640 has both the most hospitalizations (267) and the highest total
# hospitalization cost (437978).
# Hypothesis test: is the patient's race related to hospitalization cost?
# H0: mean TOTCHG is the same for every race (no relation).
# Ha: mean TOTCHG differs for at least one race.
rc <- as.factor(RACE)
summary(rc)
## 1 2 3 4 5 6 NA's
## 484 6 1 3 3 2 1
# Drop the row with the missing RACE value before modelling.
hospna <- na.omit(hosp)
# RACE is a category code, so it must enter the model as a factor; as a plain
# number aov() fits a single slope (1 Df) instead of comparing group means.
# The model is fitted on hospna so the NA handling above is actually used.
modelannova <- aov(TOTCHG ~ factor(RACE), data = hospna)
summary(modelannova)
# NOTE: the output below was recorded from the earlier call aov(TOTCHG~RACE),
# which treated RACE as numeric (hence Df = 1). Re-run to refresh it.
## Df Sum Sq Mean Sq F value Pr(>F)
## RACE 1 2.488e+06 2488459 0.164 0.686
## Residuals 497 7.540e+09 15170268
## 1 observation deleted due to missingness
# In that recorded run the p-value (0.686) is far above 0.05, so H0 cannot be
# rejected: the data give no evidence that race is related to hospital cost.
# Confirm this conclusion against the refreshed output. Note that 484 of the
# 499 non-missing records belong to race 1, so the groups are very unbalanced.
# Regress total charges on age and gender. Passing data = hosp makes the model
# read its columns from the data frame itself instead of depending on the copy
# that attach() placed on the search path.
modelm1 <- lm(TOTCHG ~ AGE + FEMALE, data = hosp)
summary(modelm1)
##
## Call:
## lm(formula = TOTCHG ~ AGE + FEMALE, data = hosp)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3406 -1443 -869 -152 44951
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2718.63 261.14 10.411 < 2e-16 ***
## AGE 86.28 25.48 3.387 0.000763 ***
## FEMALE -748.19 353.83 -2.115 0.034967 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3845 on 497 degrees of freedom
## Multiple R-squared: 0.0261, Adjusted R-squared: 0.02218
## F-statistic: 6.66 on 2 and 497 DF, p-value: 0.001399
# AGE has a very small p-value (0.000763), so it is a significant factor in
# hospital cost. FEMALE is also significant at the 5% level (p = 0.035); its
# negative coefficient means lower charges for female patients at a given age.
# The model explains only about 2.6% of the variance (R-squared = 0.0261).
# Can length of stay (LOS) be predicted from age, gender and race?
# Passing data = hosp makes the model read its columns from the data frame
# itself instead of depending on the copy that attach() placed on the search
# path. lm() drops the row with the missing RACE value on its own.
# NOTE(review): RACE enters here as a numeric code (one slope) although this
# script converts it to a factor (`rc`) for the race analysis; confirm whether
# factor(RACE) is intended.
modelm2 <- lm(LOS ~ AGE + FEMALE + RACE, data = hosp)
summary(modelm2)
##
## Call:
## lm(formula = LOS ~ AGE + FEMALE + RACE, data = hosp)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.22 -1.22 -0.85 0.15 37.78
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.94377 0.39318 7.487 3.25e-13 ***
## AGE -0.03960 0.02231 -1.775 0.0766 .
## FEMALE 0.37011 0.31024 1.193 0.2334
## RACE -0.09408 0.29312 -0.321 0.7484
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.363 on 495 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 0.007898, Adjusted R-squared: 0.001886
## F-statistic: 1.314 on 3 and 495 DF, p-value: 0.2692
# No predictor is significant at the 5% level; only the intercept is.
# The high overall p-value (0.2692) means there is no evidence of a linear
# relationship between LOS and these variables.
# That is, age, gender and race alone are not enough to predict the length of
# stay of a patient.
# Full model: regress total charges on every other column of hospna (the
# data with NA rows removed) to see which variables affect hospital cost.
# NOTE(review): RACE and APRDRG enter as numeric codes here, although this
# script converts both to factors elsewhere (`rc`, `diagg`); confirm that a
# single numeric slope per code is meaningful.
modelm3<-lm(TOTCHG~ .,data=hospna)
summary(modelm3)
##
## Call:
## lm(formula = TOTCHG ~ ., data = hospna)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6377 -700 -174 122 43378
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5218.6769 507.6475 10.280 < 2e-16 ***
## AGE 134.6949 17.4711 7.710 7.02e-14 ***
## FEMALE -390.6924 247.7390 -1.577 0.115
## LOS 743.1521 34.9225 21.280 < 2e-16 ***
## RACE -212.4291 227.9326 -0.932 0.352
## APRDRG -7.7909 0.6816 -11.430 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2613 on 493 degrees of freedom
## Multiple R-squared: 0.5536, Adjusted R-squared: 0.5491
## F-statistic: 122.3 on 5 and 493 DF, p-value: < 2.2e-16
# AGE, LOS and APRDRG all have very small p-values, so age, length of stay
# and diagnosis group affect the total hospital cost.
# FEMALE (p = 0.115) and RACE (p = 0.352) are not significant in this model.
# The model explains about 55% of the variance (R-squared = 0.5536).