df <- read.csv("C:\\Users\\ca\\Desktop\\R Studio\\Modulo 4\\rentadebicis.csv")
summary(df)
## hora dia mes año
## Min. : 0.00 Min. : 1.000 Min. : 1.000 Min. :2011
## 1st Qu.: 6.00 1st Qu.: 5.000 1st Qu.: 4.000 1st Qu.:2011
## Median :12.00 Median :10.000 Median : 7.000 Median :2012
## Mean :11.54 Mean : 9.993 Mean : 6.521 Mean :2012
## 3rd Qu.:18.00 3rd Qu.:15.000 3rd Qu.:10.000 3rd Qu.:2012
## Max. :23.00 Max. :19.000 Max. :12.000 Max. :2012
## estacion dia_de_la_semana asueto temperatura
## Min. :1.000 Min. :1.000 Min. :0.00000 Min. : 0.82
## 1st Qu.:2.000 1st Qu.:2.000 1st Qu.:0.00000 1st Qu.:13.94
## Median :3.000 Median :4.000 Median :0.00000 Median :20.50
## Mean :2.507 Mean :4.014 Mean :0.02857 Mean :20.23
## 3rd Qu.:4.000 3rd Qu.:6.000 3rd Qu.:0.00000 3rd Qu.:26.24
## Max. :4.000 Max. :7.000 Max. :1.00000 Max. :41.00
## sensacion_termica humedad velocidad_del_viento
## Min. : 0.76 Min. : 0.00 Min. : 0.000
## 1st Qu.:16.66 1st Qu.: 47.00 1st Qu.: 7.002
## Median :24.24 Median : 62.00 Median :12.998
## Mean :23.66 Mean : 61.89 Mean :12.799
## 3rd Qu.:31.06 3rd Qu.: 77.00 3rd Qu.:16.998
## Max. :45.45 Max. :100.00 Max. :56.997
## rentas_de_no_registrados rentas_de_registrados rentas_totales
## Min. : 0.00 Min. : 0.0 Min. : 1.0
## 1st Qu.: 4.00 1st Qu.: 36.0 1st Qu.: 42.0
## Median : 17.00 Median :118.0 Median :145.0
## Mean : 36.02 Mean :155.6 Mean :191.6
## 3rd Qu.: 49.00 3rd Qu.:222.0 3rd Qu.:284.0
## Max. :367.00 Max. :886.0 Max. :977.0
Observaciones: 1. Los días llegan hasta 19 y no hasta 31 días.
regresion<-lm(rentas_totales ~ hora+dia+mes+año+estacion+dia_de_la_semana+ asueto+temperatura+sensacion_termica+humedad+velocidad_del_viento, data=df)
summary(regresion)
##
## Call:
## lm(formula = rentas_totales ~ hora + dia + mes + año + estacion +
## dia_de_la_semana + asueto + temperatura + sensacion_termica +
## humedad + velocidad_del_viento, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -305.52 -93.64 -27.70 61.85 649.10
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.661e+05 5.496e+03 -30.217 < 2e-16 ***
## hora 7.735e+00 2.070e-01 37.368 < 2e-16 ***
## dia 3.844e-01 2.482e-01 1.549 0.12150
## mes 9.996e+00 1.682e+00 5.943 2.89e-09 ***
## año 8.258e+01 2.732e+00 30.225 < 2e-16 ***
## estacion -7.774e+00 5.177e+00 -1.502 0.13324
## dia_de_la_semana 4.393e-01 6.918e-01 0.635 0.52545
## asueto -4.864e+00 8.365e+00 -0.582 0.56089
## temperatura 1.582e+00 1.038e+00 1.524 0.12752
## sensacion_termica 4.748e+00 9.552e-01 4.971 6.76e-07 ***
## humedad -2.115e+00 7.884e-02 -26.827 < 2e-16 ***
## velocidad_del_viento 5.582e-01 1.809e-01 3.086 0.00203 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 141.7 on 10874 degrees of freedom
## Multiple R-squared: 0.3891, Adjusted R-squared: 0.3885
## F-statistic: 629.6 on 11 and 10874 DF, p-value: < 2.2e-16
regresion<-lm(rentas_totales ~ hora+mes+año+sensacion_termica+humedad+velocidad_del_viento, data=df)
summary(regresion)
##
## Call:
## lm(formula = rentas_totales ~ hora + mes + año + sensacion_termica +
## humedad + velocidad_del_viento, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -308.60 -93.85 -28.34 61.05 648.09
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.662e+05 5.496e+03 -30.250 < 2e-16 ***
## hora 7.734e+00 2.070e-01 37.364 < 2e-16 ***
## mes 7.574e+00 4.207e-01 18.002 < 2e-16 ***
## año 8.266e+01 2.732e+00 30.258 < 2e-16 ***
## sensacion_termica 6.172e+00 1.689e-01 36.539 < 2e-16 ***
## humedad -2.121e+00 7.858e-02 -26.988 < 2e-16 ***
## velocidad_del_viento 6.208e-01 1.771e-01 3.506 0.000457 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 141.7 on 10879 degrees of freedom
## Multiple R-squared: 0.3886, Adjusted R-squared: 0.3883
## F-statistic: 1153 on 6 and 10879 DF, p-value: < 2.2e-16
datos<- data.frame(hora=11.54,mes=1:12,año=2013,sensacion_termica=23.66, humedad=61.89, velocidad_del_viento=12.799)
predict(regresion,datos)
## 1 2 3 4 5 6 7 8
## 273.6001 281.1738 288.7475 296.3213 303.8950 311.4687 319.0424 326.6161
## 9 10 11 12
## 334.1898 341.7635 349.3372 356.9110
bd <- read.csv("C:\\Users\\ca\\Desktop\\R Studio\\Modulo 4\\HousePriceData.csv")
View(bd)
summary(bd)
## Observation Dist_Taxi Dist_Market Dist_Hospital
## Min. : 1.0 Min. : 146 Min. : 1666 Min. : 3227
## 1st Qu.:237.0 1st Qu.: 6477 1st Qu.: 9367 1st Qu.:11302
## Median :469.0 Median : 8228 Median :11149 Median :13189
## Mean :468.4 Mean : 8235 Mean :11022 Mean :13091
## 3rd Qu.:700.0 3rd Qu.: 9939 3rd Qu.:12675 3rd Qu.:14855
## Max. :932.0 Max. :20662 Max. :20945 Max. :23294
##
## Carpet Builtup Parking City_Category
## Min. : 775 Min. : 932 Length:905 Length:905
## 1st Qu.: 1317 1st Qu.: 1579 Class :character Class :character
## Median : 1478 Median : 1774 Mode :character Mode :character
## Mean : 1511 Mean : 1794
## 3rd Qu.: 1654 3rd Qu.: 1985
## Max. :24300 Max. :12730
## NA's :7
## Rainfall House_Price
## Min. :-110.0 Min. : 1492000
## 1st Qu.: 600.0 1st Qu.: 4623000
## Median : 780.0 Median : 5860000
## Mean : 786.9 Mean : 6083992
## 3rd Qu.: 970.0 3rd Qu.: 7200000
## Max. :1560.0 Max. :150000000
##
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
count(bd,Parking, sort=TRUE)
## Parking n
## 1 Open 355
## 2 Not Provided 225
## 3 Covered 184
## 4 No Parking 141
count(bd, City_Category, sort=TRUE)
## City_Category n
## 1 CAT B 351
## 2 CAT A 320
## 3 CAT C 234
Observaciones: 1. El precio de la casa esta con datos atipicos. 2. Rainfall tiene valores negativos. 3. Carpet tiene 7NA.
# Cuántos NA tengo en la base de datos
sum(is.na(bd))
## [1] 7
# Cuántos NA tengo por variable
sapply(bd, function(x) sum(is.na(x)))
## Observation Dist_Taxi Dist_Market Dist_Hospital Carpet
## 0 0 0 0 7
## Builtup Parking City_Category Rainfall House_Price
## 0 0 0 0 0
# Eliminar NA
bd<-na.omit(bd)
# Eliminar registro del precio atípico
bd<-bd[bd$House_Price<12000000,]
# Eliminar registro de Rainfall negativo
bd<-bd[bd$Rainfall>=0,]
# Gráfica
boxplot(bd$House_Price,horizontal=TRUE)
regresion2<-lm(House_Price ~ Dist_Taxi+Dist_Market+Dist_Hospital+Carpet+Builtup+Parking+ City_Category+Rainfall, data=bd)
summary(regresion2)
##
## Call:
## lm(formula = House_Price ~ Dist_Taxi + Dist_Market + Dist_Hospital +
## Carpet + Builtup + Parking + City_Category + Rainfall, data = bd)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3572009 -800792 -65720 761534 4401585
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.599e+06 3.672e+05 15.246 < 2e-16 ***
## Dist_Taxi 3.009e+01 2.682e+01 1.122 0.2622
## Dist_Market 1.285e+01 2.081e+01 0.618 0.5370
## Dist_Hospital 4.864e+01 3.008e+01 1.617 0.1062
## Carpet -7.997e+02 3.476e+03 -0.230 0.8181
## Builtup 1.339e+03 2.901e+03 0.462 0.6444
## ParkingNo Parking -6.040e+05 1.389e+05 -4.348 1.53e-05 ***
## ParkingNot Provided -4.924e+05 1.235e+05 -3.988 7.22e-05 ***
## ParkingOpen -2.632e+05 1.126e+05 -2.338 0.0196 *
## City_CategoryCAT B -1.877e+06 9.598e+04 -19.554 < 2e-16 ***
## City_CategoryCAT C -2.890e+06 1.059e+05 -27.300 < 2e-16 ***
## Rainfall -1.175e+02 1.550e+02 -0.758 0.4484
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1222000 on 884 degrees of freedom
## Multiple R-squared: 0.5007, Adjusted R-squared: 0.4945
## F-statistic: 80.58 on 11 and 884 DF, p-value: < 2.2e-16
regresion2<-lm(House_Price ~ Dist_Taxi+Dist_Market+Dist_Hospital+Carpet+Builtup+Parking+ City_Category+Rainfall, data=bd)
summary(regresion2)
##
## Call:
## lm(formula = House_Price ~ Dist_Taxi + Dist_Market + Dist_Hospital +
## Carpet + Builtup + Parking + City_Category + Rainfall, data = bd)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3572009 -800792 -65720 761534 4401585
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.599e+06 3.672e+05 15.246 < 2e-16 ***
## Dist_Taxi 3.009e+01 2.682e+01 1.122 0.2622
## Dist_Market 1.285e+01 2.081e+01 0.618 0.5370
## Dist_Hospital 4.864e+01 3.008e+01 1.617 0.1062
## Carpet -7.997e+02 3.476e+03 -0.230 0.8181
## Builtup 1.339e+03 2.901e+03 0.462 0.6444
## ParkingNo Parking -6.040e+05 1.389e+05 -4.348 1.53e-05 ***
## ParkingNot Provided -4.924e+05 1.235e+05 -3.988 7.22e-05 ***
## ParkingOpen -2.632e+05 1.126e+05 -2.338 0.0196 *
## City_CategoryCAT B -1.877e+06 9.598e+04 -19.554 < 2e-16 ***
## City_CategoryCAT C -2.890e+06 1.059e+05 -27.300 < 2e-16 ***
## Rainfall -1.175e+02 1.550e+02 -0.758 0.4484
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1222000 on 884 degrees of freedom
## Multiple R-squared: 0.5007, Adjusted R-squared: 0.4945
## F-statistic: 80.58 on 11 and 884 DF, p-value: < 2.2e-16
En este caso no había necesidad de ajustar.
datos<- data.frame(Dist_Taxi=8278,Dist_Market=16251,Dist_Hospital=13857,Carpet=1455, Builtup=1764, Parking="Covered",City_Category="CAT A", Rainfall=390)
predict(regresion2,datos)
## 1
## 7883860
titanic <- read.csv("C:\\Users\\ca\\Desktop\\R Studio\\Modulo 4\\titanic.csv")
summary(titanic)
## pclass survived name sex
## Min. :1.000 Min. :0.000 Length:1310 Length:1310
## 1st Qu.:2.000 1st Qu.:0.000 Class :character Class :character
## Median :3.000 Median :0.000 Mode :character Mode :character
## Mean :2.295 Mean :0.382
## 3rd Qu.:3.000 3rd Qu.:1.000
## Max. :3.000 Max. :1.000
## NA's :1 NA's :1
## age sibsp parch ticket
## Min. : 0.1667 Min. :0.0000 Min. :0.000 Length:1310
## 1st Qu.:21.0000 1st Qu.:0.0000 1st Qu.:0.000 Class :character
## Median :28.0000 Median :0.0000 Median :0.000 Mode :character
## Mean :29.8811 Mean :0.4989 Mean :0.385
## 3rd Qu.:39.0000 3rd Qu.:1.0000 3rd Qu.:0.000
## Max. :80.0000 Max. :8.0000 Max. :9.000
## NA's :264 NA's :1 NA's :1
## fare cabin embarked boat
## Min. : 0.000 Length:1310 Length:1310 Length:1310
## 1st Qu.: 7.896 Class :character Class :character Class :character
## Median : 14.454 Mode :character Mode :character Mode :character
## Mean : 33.295
## 3rd Qu.: 31.275
## Max. :512.329
## NA's :2
## body home.dest
## Min. : 1.0 Length:1310
## 1st Qu.: 72.0 Class :character
## Median :155.0 Mode :character
## Mean :160.8
## 3rd Qu.:256.0
## Max. :328.0
## NA's :1189
library(dplyr)
#count(titanic,name,sort=TRUE)
#count(titanic,sex,sort=TRUE)
#count(titanic,ticket,sort=TRUE)
#count(titanic,cabin,sort=TRUE)
#count(titanic,embarked,sort=TRUE)
#count(titanic,boat,sort=TRUE)
#count(titanic,home.dest,sort=TRUE)
Observaciones: 1. Tenemos NA en la base de datos. 2. Un par de nombres estan repetidos.
# Cambiar de nombre a la variable pclass
colnames(titanic)[1]<-"class"
# Extraer las variables de interés
Titanic <- titanic[,c("class","age", "sex","survived")]
# ¿Cuántos NA tengo en la base de datos?
sum(is.na(Titanic))
## [1] 266
# ¿Cuántos NA tengo por variable?
sapply(Titanic, function(x) sum(is.na(x)))
## class age sex survived
## 1 264 0 1
# Eliminar NA
Titanic <- na.omit(Titanic)
str(Titanic)
## 'data.frame': 1046 obs. of 4 variables:
## $ class : int 1 1 1 1 1 1 1 1 1 1 ...
## $ age : num 29 0.917 2 30 25 ...
## $ sex : chr "female" "male" "female" "male" ...
## $ survived: int 1 1 0 0 0 1 1 0 1 0 ...
## - attr(*, "na.action")= 'omit' Named int [1:264] 16 38 41 47 60 70 71 75 81 107 ...
## ..- attr(*, "names")= chr [1:264] "16" "38" "41" "47" ...
# Convertir las variables categoricas en factores.
Titanic$class<- as.factor(Titanic$class)
Titanic$sex<- as.factor(Titanic$sex)
Titanic$survived<- as.factor(Titanic$survived)
str(Titanic)
## 'data.frame': 1046 obs. of 4 variables:
## $ class : Factor w/ 3 levels "1","2","3": 1 1 1 1 1 1 1 1 1 1 ...
## $ age : num 29 0.917 2 30 25 ...
## $ sex : Factor w/ 2 levels "female","male": 1 2 1 2 1 2 1 2 1 2 ...
## $ survived: Factor w/ 2 levels "0","1": 2 2 1 1 1 2 2 1 2 1 ...
## - attr(*, "na.action")= 'omit' Named int [1:264] 16 38 41 47 60 70 71 75 81 107 ...
## ..- attr(*, "names")= chr [1:264] "16" "38" "41" "47" ...
# install.packages("spart")
library(rpart)
# install.packages("rpart.plot")
library(rpart.plot)
arbol <- rpart(formula=survived ~ ., data=Titanic)
arbol
## n= 1046
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 1046 427 0 (0.59177820 0.40822180)
## 2) sex=male 658 135 0 (0.79483283 0.20516717)
## 4) age>=9.5 615 110 0 (0.82113821 0.17886179) *
## 5) age< 9.5 43 18 1 (0.41860465 0.58139535)
## 10) class=3 29 11 0 (0.62068966 0.37931034) *
## 11) class=1,2 14 0 1 (0.00000000 1.00000000) *
## 3) sex=female 388 96 1 (0.24742268 0.75257732)
## 6) class=3 152 72 0 (0.52631579 0.47368421)
## 12) age>=1.5 145 66 0 (0.54482759 0.45517241) *
## 13) age< 1.5 7 1 1 (0.14285714 0.85714286) *
## 7) class=1,2 236 16 1 (0.06779661 0.93220339) *
rpart.plot(arbol)
prp(arbol, extra=7)
cancer_de_mama <- read.csv("C:\\Users\\ca\\Desktop\\R Studio\\Modulo 4\\cancer_de_mama.csv")
summary(cancer_de_mama)
## diagnosis radius_mean texture_mean perimeter_mean
## Length:569 Min. : 6.981 Min. : 9.71 Min. : 43.79
## Class :character 1st Qu.:11.700 1st Qu.:16.17 1st Qu.: 75.17
## Mode :character Median :13.370 Median :18.84 Median : 86.24
## Mean :14.127 Mean :19.29 Mean : 91.97
## 3rd Qu.:15.780 3rd Qu.:21.80 3rd Qu.:104.10
## Max. :28.110 Max. :39.28 Max. :188.50
## area_mean smoothness_mean compactness_mean concavity_mean
## Min. : 143.5 Min. :0.05263 Min. :0.01938 Min. :0.00000
## 1st Qu.: 420.3 1st Qu.:0.08637 1st Qu.:0.06492 1st Qu.:0.02956
## Median : 551.1 Median :0.09587 Median :0.09263 Median :0.06154
## Mean : 654.9 Mean :0.09636 Mean :0.10434 Mean :0.08880
## 3rd Qu.: 782.7 3rd Qu.:0.10530 3rd Qu.:0.13040 3rd Qu.:0.13070
## Max. :2501.0 Max. :0.16340 Max. :0.34540 Max. :0.42680
## concave.points_mean symmetry_mean fractal_dimension_mean radius_se
## Min. :0.00000 Min. :0.1060 Min. :0.04996 Min. :0.1115
## 1st Qu.:0.02031 1st Qu.:0.1619 1st Qu.:0.05770 1st Qu.:0.2324
## Median :0.03350 Median :0.1792 Median :0.06154 Median :0.3242
## Mean :0.04892 Mean :0.1812 Mean :0.06280 Mean :0.4052
## 3rd Qu.:0.07400 3rd Qu.:0.1957 3rd Qu.:0.06612 3rd Qu.:0.4789
## Max. :0.20120 Max. :0.3040 Max. :0.09744 Max. :2.8730
## texture_se perimeter_se area_se smoothness_se
## Min. :0.3602 Min. : 0.757 Min. : 6.802 Min. :0.001713
## 1st Qu.:0.8339 1st Qu.: 1.606 1st Qu.: 17.850 1st Qu.:0.005169
## Median :1.1080 Median : 2.287 Median : 24.530 Median :0.006380
## Mean :1.2169 Mean : 2.866 Mean : 40.337 Mean :0.007041
## 3rd Qu.:1.4740 3rd Qu.: 3.357 3rd Qu.: 45.190 3rd Qu.:0.008146
## Max. :4.8850 Max. :21.980 Max. :542.200 Max. :0.031130
## compactness_se concavity_se concave.points_se symmetry_se
## Min. :0.002252 Min. :0.00000 Min. :0.000000 Min. :0.007882
## 1st Qu.:0.013080 1st Qu.:0.01509 1st Qu.:0.007638 1st Qu.:0.015160
## Median :0.020450 Median :0.02589 Median :0.010930 Median :0.018730
## Mean :0.025478 Mean :0.03189 Mean :0.011796 Mean :0.020542
## 3rd Qu.:0.032450 3rd Qu.:0.04205 3rd Qu.:0.014710 3rd Qu.:0.023480
## Max. :0.135400 Max. :0.39600 Max. :0.052790 Max. :0.078950
## fractal_dimension_se radius_worst texture_worst perimeter_worst
## Min. :0.0008948 Min. : 7.93 Min. :12.02 Min. : 50.41
## 1st Qu.:0.0022480 1st Qu.:13.01 1st Qu.:21.08 1st Qu.: 84.11
## Median :0.0031870 Median :14.97 Median :25.41 Median : 97.66
## Mean :0.0037949 Mean :16.27 Mean :25.68 Mean :107.26
## 3rd Qu.:0.0045580 3rd Qu.:18.79 3rd Qu.:29.72 3rd Qu.:125.40
## Max. :0.0298400 Max. :36.04 Max. :49.54 Max. :251.20
## area_worst smoothness_worst compactness_worst concavity_worst
## Min. : 185.2 Min. :0.07117 Min. :0.02729 Min. :0.0000
## 1st Qu.: 515.3 1st Qu.:0.11660 1st Qu.:0.14720 1st Qu.:0.1145
## Median : 686.5 Median :0.13130 Median :0.21190 Median :0.2267
## Mean : 880.6 Mean :0.13237 Mean :0.25427 Mean :0.2722
## 3rd Qu.:1084.0 3rd Qu.:0.14600 3rd Qu.:0.33910 3rd Qu.:0.3829
## Max. :4254.0 Max. :0.22260 Max. :1.05800 Max. :1.2520
## concave.points_worst symmetry_worst fractal_dimension_worst
## Min. :0.00000 Min. :0.1565 Min. :0.05504
## 1st Qu.:0.06493 1st Qu.:0.2504 1st Qu.:0.07146
## Median :0.09993 Median :0.2822 Median :0.08004
## Mean :0.11461 Mean :0.2901 Mean :0.08395
## 3rd Qu.:0.16140 3rd Qu.:0.3179 3rd Qu.:0.09208
## Max. :0.29100 Max. :0.6638 Max. :0.20750
library(dplyr)
count(cancer_de_mama,diagnosis,sort=TRUE)
## diagnosis n
## 1 B 357
## 2 M 212
Observaciones: Ninguna, ya que la base de datos es adecuada.
# install.packages("spart")
library(rpart)
# install.packages("rpart.plot")
library(rpart.plot)
arbol <- rpart(formula=diagnosis ~ ., data=cancer_de_mama)
arbol
## n= 569
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 569 212 B (0.62741652 0.37258348)
## 2) radius_worst< 16.795 379 33 B (0.91292876 0.08707124)
## 4) concave.points_worst< 0.1358 333 5 B (0.98498498 0.01501502) *
## 5) concave.points_worst>=0.1358 46 18 M (0.39130435 0.60869565)
## 10) texture_worst< 25.67 19 4 B (0.78947368 0.21052632) *
## 11) texture_worst>=25.67 27 3 M (0.11111111 0.88888889) *
## 3) radius_worst>=16.795 190 11 M (0.05789474 0.94210526) *
rpart.plot(arbol)
prp(arbol, extra=7)
# 1. Crear base de datos
df<- data.frame(x=c(2,2,8,5,7,6,1,4),
y=c(10,5,4,8,5,4,2,9))
# 2. Determinar el número de grupos
grupos <- 3
# 3. Realizar la clasificación
segmentos <- kmeans(df,grupos)
segmentos
## K-means clustering with 3 clusters of sizes 3, 3, 2
##
## Cluster means:
## x y
## 1 7.000000 4.333333
## 2 3.666667 9.000000
## 3 1.500000 3.500000
##
## Clustering vector:
## [1] 2 3 1 2 1 1 3 2
##
## Within cluster sum of squares by cluster:
## [1] 2.666667 6.666667 5.000000
## (between_SS / total_SS = 85.8 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
# 4. Revisar la asignación de grupos
asignacion <- cbind(df, cluster=segmentos$cluster)
asignacion
## x y cluster
## 1 2 10 2
## 2 2 5 3
## 3 8 4 1
## 4 5 8 2
## 5 7 5 1
## 6 6 4 1
## 7 1 2 3
## 8 4 9 2
# 5. Graficar resultados
library(ggplot2)
library(factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
fviz_cluster(segmentos, data=df,
palette=c("red","blue","darkgreen"),
ellipse.type="euclip",
star.plot=T,
repel=T,
ggtheme=theme()
)
# 6. Optimizar cantidad de grupos
library(cluster)
library(data.table)
##
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
##
## between, first, last
set.seed(123)
optimizacion <- clusGap(df, FUN=kmeans, nstart=1, K.max=7)
plot(optimizacion, xlab="No. de clusters k")
# El punto mas alto de la gráfica indica la cantidad de grupos óptimo en los que se puede clasificar los datos.
bd<-read.csv("C:\\Users\\ca\\Desktop\\R Studio\\Modulo 4\\ventas.csv")
summary(bd)
## BillNo Itemname Quantity Date
## Length:522064 Length:522064 Min. :-9600.00 Length:522064
## Class :character Class :character 1st Qu.: 1.00 Class :character
## Mode :character Mode :character Median : 3.00 Mode :character
## Mean : 10.09
## 3rd Qu.: 10.00
## Max. :80995.00
##
## Hour Price CustomerID Country
## Length:522064 Min. :-11062.060 Min. :12346 Length:522064
## Class :character 1st Qu.: 1.250 1st Qu.:13950 Class :character
## Mode :character Median : 2.080 Median :15265 Mode :character
## Mean : 3.827 Mean :15317
## 3rd Qu.: 4.130 3rd Qu.:16837
## Max. : 13541.330 Max. :18287
## NA's :134041
## Total
## Min. :-11062.06
## 1st Qu.: 3.75
## Median : 9.78
## Mean : 19.69
## 3rd Qu.: 17.40
## Max. :168469.60
##
library(dplyr)
#count(bd,BillNo, sort = TRUE)
#count(bd,Itemname, sort = TRUE)
#count(bd,Date, sort = TRUE)
#count(bd,Hour, sort = TRUE)
#count(bd,Country, sort = TRUE)
Observaciones:
# Cuantos NA tengo en la base de datos
sum(is.na(bd))
## [1] 134041
# Cuantos NA tento por variable
sapply(bd, function(x) sum(is.na(x)))
## BillNo Itemname Quantity Date Hour Price CustomerID
## 0 0 0 0 0 0 134041
## Country Total
## 0 0
#Eliminar NA
bd <- na.omit(bd)
# Eliminar totales negativos
bd <- bd[bd$Total>0,]
# Identificar outliers
boxplot(bd$Total, horizontal=TRUE)
Observaciones: 4. Tenemos outliers en Total.
# Obtener cantidad de visitas por cliente
Visitas<- group_by(bd,CustomerID) %>%
summarize(Visitas=n_distinct(BillNo))
# Obtener el total por ticket
ticket_promedio <- aggregate(Total ~ CustomerID+BillNo, data=bd, sum)
# Obtener el ticket promedio
ticket_promedio <- aggregate(Total ~ CustomerID, data=ticket_promedio, mean)
# Juntar las tablas Visitas y Ticket Promedio
objetos<- merge(Visitas, ticket_promedio, by="CustomerID")
# Llamar a los renglones como CustomerID
rownames(objetos) <- objetos$CustomerID
# Eliminar columna CustomerID
objetos<-subset(objetos, select=-c(CustomerID))
# Eliminar datos fuera de los normal
# Los datos fuera de lo normal estan fuera de los siguientes limites:
#Limite inferior = Q1 - 1.5*IQR
#Limite superior = Q3 + 1.5*IQR
#Q1:Cuartil 1, Q3: Cuartil 3, IQR=Rango Intercuartil
# Columna Visitas
IQR_V <- IQR(objetos$Visitas)
IQR_V
## [1] 4
summary(objetos)
## Visitas Total
## Min. : 1.000 Min. : 3.45
## 1st Qu.: 1.000 1st Qu.: 178.30
## Median : 2.000 Median : 292.00
## Mean : 4.227 Mean : 415.62
## 3rd Qu.: 5.000 3rd Qu.: 426.63
## Max. :209.000 Max. :84236.25
LI_V <- 1-1.5*IQR_V
LI_V
## [1] -5
LS_V <- 5+1.5*IQR_V
LS_V
## [1] 11
objetos <- objetos[objetos$Visitas <=11,]
# Renombrar columnas
colnames(objetos)<-c("Visitas", "TicketPromedio")
# Ticket Promedio
IQR_TP<- IQR(objetos$TicketPromedio)
IQR_TP
## [1] 243.3733
LI_TP <- 178.30 - 1.5*IQR_TP
LI_TP
## [1] -186.76
LS_TP<- 426.63 + 1.5*IQR_TP
LS_TP
## [1] 791.69
objetos <- objetos[objetos$TicketPromedio <=791.69,]
# 1. Crear base de datos
df<- objetos
# 2. Determinar el número de grupos
grupos <- 4
# 3. Realizar la clasificación
segmentos <- kmeans(df,grupos)
# segmentos
# 4. Revisar la asignación de grupos
asignacion <- cbind(df, cluster=segmentos$cluster)
# asignacion
# 5. Graficar resultados
library(ggplot2)
library(factoextra)
# fviz_cluster(segmentos, data=df,
# palette=c("red","blue","darkgreen", "yellow"),
# ellipse.type="euclip",
# star.plot=T,
# repel=T,
# ggtheme=theme()
# )
# 6. Optimizar cantidad de grupos
library(cluster)
library(data.table)
set.seed(123)
optimizacion <- clusGap(df, FUN=kmeans, nstart=1, K.max=7)
## Warning: Quick-TRANSfer stage steps exceeded maximum (= 188200)
plot(optimizacion, xlab="No. de clusters k")
Importamos las librerías
library (tidyverse)
library (foreign)
library (ggplot2)
library(dplyr)
library(scales)
library(ggrepel)
library(readr)
library(readxl)
library(rpart)
library(rpart.plot)
library(factoextra)
Importamos las Bases de Datos
claims <- read.csv("C:\\Users\\ca\\Desktop\\R Studio\\Modulo 4\\Gastos Médicos\\ClaimsData2018 (1).csv")
transactions <- read.csv("C:\\Users\\ca\\Desktop\\R Studio\\Modulo 4\\Gastos Médicos\\TransactionsSummary2018.csv")
datos <- as.data.frame(claims)
Observaciones
summary(claims)
## ClaimID TotalPaid TotalReserves TotalRecovery
## Min. : 650915 Min. : -270 Min. : 0 Min. : 0.00
## 1st Qu.: 811125 1st Qu.: 60 1st Qu.: 0 1st Qu.: 0.00
## Median : 844626 Median : 235 Median : 0 Median : 0.00
## Mean :10149151 Mean : 6746 Mean : 2233 Mean : 68.88
## 3rd Qu.:22716506 3rd Qu.: 938 3rd Qu.: 0 3rd Qu.: 0.00
## Max. :62203891 Max. :4527291 Max. :2069575 Max. :130541.03
## IndemnityPaid OtherPaid ClaimStatus IncidentDate
## Min. : -475 Min. : -7820 Length:134004 Length:134004
## 1st Qu.: 0 1st Qu.: 58 Class :character Class :character
## Median : 0 Median : 230 Mode :character Mode :character
## Mean : 3061 Mean : 3685
## 3rd Qu.: 0 3rd Qu.: 855
## Max. :640732 Max. :4129915
## IncidentDescription ReturnToWorkDate AverageWeeklyWage ClaimantOpenedDate
## Length:134004 Length:134004 Length:134004 Length:134004
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## ClaimantClosedDate EmployerNotificationDate ReceivedDate
## Length:134004 Length:134004 Length:134004
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
## IsDenied ClaimantAge_at_DOI Gender ClaimantType
## Min. :0.00000 Length:134004 Length:134004 Length:134004
## 1st Qu.:0.00000 Class :character Class :character Class :character
## Median :0.00000 Mode :character Mode :character Mode :character
## Mean :0.04474
## 3rd Qu.:0.00000
## Max. :1.00000
## Injuryture BodyPartRegion BodyPart
## Length:134004 Length:134004 Length:134004
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
summary(transactions)
## ClaimID BillReviewALE Hospital PhysicianOutpatient
## Min. : 633915 Min. : -456.0 Min. : -12570.4 Min. : -4655.7
## 1st Qu.: 793224 1st Qu.: 16.0 1st Qu.: 193.9 1st Qu.: 107.6
## Median : 828263 Median : 32.0 Median : 559.1 Median : 221.6
## Mean :10112865 Mean : 191.2 Mean : 4394.7 Mean : 1752.3
## 3rd Qu.:22700126 3rd Qu.: 80.0 3rd Qu.: 2253.4 3rd Qu.: 710.5
## Max. :62246496 Max. :56475.3 Max. :2759604.0 Max. :1481468.5
## NA's :61857 NA's :67254 NA's :6978
## Rx
## Min. : -469.5
## 1st Qu.: 23.3
## Median : 58.3
## Mean : 1140.4
## 3rd Qu.: 174.5
## Max. :631635.5
## NA's :67744
Modificamos el tipo de variable, de caracter a fecha
claims$ClaimantOpenedDate<-as.Date(claims$ClaimantOpenedDate,format="%d/%m/%Y")
claims$ClaimantClosedDate<-as.Date(claims$ClaimantClosedDate,format="%d/%m/%Y")
claims$IncidentDate<-as.Date(claims$IncidentDate,format="%d/%m/%Y")
claims$ReturnToWorkDate<-as.Date(claims$ReturnToWorkDate,format="%d/%m/%Y")
claims$EmployerNotificationDate<-as.Date(claims$EmployerNotificationDate,format="%d/%m/%Y")
claims$ReceivedDate<-as.Date(claims$ReceivedDate,format="%d/%m/%Y")
Convertimos el tipo de variable de caracter a numerica
claims<-claims%>%mutate(AverageWeeklyWage=as.numeric(AverageWeeklyWage))
claims<-claims%>%mutate(ClaimantAge_at_DOI=as.numeric(ClaimantAge_at_DOI))
Eliminamos las edades iguales o menores a cero
claims <- subset(claims, ClaimantAge_at_DOI>= 0)
Unificamos las bases de datos con un left join en un nuevo df
trans_claims<-claims%>%left_join(transactions,by="ClaimID")
summary(trans_claims)
## ClaimID TotalPaid TotalReserves TotalRecovery
## Min. : 650915 Min. : -22 Min. : 0 Min. : 0.00
## 1st Qu.: 814283 1st Qu.: 82 1st Qu.: 0 1st Qu.: 0.00
## Median : 845571 Median : 281 Median : 0 Median : 0.00
## Mean :12146395 Mean : 8368 Mean : 3365 Mean : 68.22
## 3rd Qu.:22725794 3rd Qu.: 1229 3rd Qu.: 0 3rd Qu.: 0.00
## Max. :62203891 Max. :4527291 Max. :2069575 Max. :130541.03
##
## IndemnityPaid OtherPaid ClaimStatus IncidentDate
## Min. : -475 Min. : -22 Length:88833 Min. :1968-09-27
## 1st Qu.: 0 1st Qu.: 79 Class :character 1st Qu.:2003-08-05
## Median : 0 Median : 275 Mode :character Median :2007-02-27
## Mean : 3698 Mean : 4670 Mean :2006-10-11
## 3rd Qu.: 0 3rd Qu.: 1126 3rd Qu.:2010-11-17
## Max. :640732 Max. :4129915 Max. :2014-06-27
##
## IncidentDescription ReturnToWorkDate AverageWeeklyWage
## Length:88833 Min. :1984-01-01 Min. : 0.0
## Class :character 1st Qu.:2005-12-09 1st Qu.: 300.0
## Mode :character Median :2009-04-29 Median : 502.6
## Mean :2008-07-19 Mean : 629.0
## 3rd Qu.:2012-02-14 3rd Qu.: 684.8
## Max. :2015-05-07 Max. :2024000.0
## NA's :30493 NA's :50480
## ClaimantOpenedDate ClaimantClosedDate EmployerNotificationDate
## Min. :1968-09-27 Min. :1999-06-01 Min. :1972-09-10
## 1st Qu.:2003-09-18 1st Qu.:2005-10-04 1st Qu.:2004-03-31
## Median :2007-04-18 Median :2008-12-11 Median :2007-09-26
## Mean :2006-11-22 Mean :2008-10-09 Mean :2007-10-07
## 3rd Qu.:2010-12-13 3rd Qu.:2011-06-10 3rd Qu.:2011-06-01
## Max. :2014-06-30 Max. :2014-06-30 Max. :9988-02-21
## NA's :4637 NA's :11756
## ReceivedDate IsDenied ClaimantAge_at_DOI Gender
## Min. :1968-09-27 Min. :0.00000 Min. : 0.00 Length:88833
## 1st Qu.:2003-09-16 1st Qu.:0.00000 1st Qu.:33.00 Class :character
## Median :2007-03-28 Median :0.00000 Median :42.00 Mode :character
## Mean :2007-01-28 Mean :0.05227 Mean :42.05
## 3rd Qu.:2010-12-17 3rd Qu.:0.00000 3rd Qu.:51.00
## Max. :5202-01-10 Max. :1.00000 Max. :94.00
##
## ClaimantType Injuryture BodyPartRegion BodyPart
## Length:88833 Length:88833 Length:88833 Length:88833
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## BillReviewALE Hospital PhysicianOutpatient Rx
## Min. : -448.0 Min. : -12570.4 Min. : -549.5 Min. : -160.7
## 1st Qu.: 16.0 1st Qu.: 205.4 1st Qu.: 105.8 1st Qu.: 22.4
## Median : 24.0 Median : 600.9 Median : 217.3 Median : 60.1
## Mean : 188.4 Mean : 5005.3 Mean : 1773.7 Mean : 1625.4
## 3rd Qu.: 64.0 3rd Qu.: 2298.7 3rd Qu.: 673.7 3rd Qu.: 183.0
## Max. :46055.3 Max. :2759604.0 Max. :1219766.6 Max. :631635.5
## NA's :70866 NA's :75592 NA's :56446 NA's :76768
head(trans_claims)
## ClaimID TotalPaid TotalReserves TotalRecovery IndemnityPaid OtherPaid
## 1 650915 11947.55 0 0 243.65 11703.90
## 2 650916 0.00 0 0 0.00 0.00
## 3 650917 9295.89 0 0 0.00 9295.89
## 4 650918 1026.29 0 0 0.00 1026.29
## 5 650919 43108.03 0 0 40000.00 3108.03
## 6 650920 331.90 0 0 0.00 331.90
## ClaimStatus IncidentDate
## 1 C 2009-06-17
## 2 C 2009-06-26
## 3 C 2009-06-25
## 4 C 2009-06-12
## 5 C 2009-06-29
## 6 C 2009-06-25
## IncidentDescription
## 1 Employee was moving concrete rings and installing a meter. He strained lower back.
## 2 Employee was pulling lining. He felt a pop in the back causing a strain.
## 3 Employee was in the restroom. He heard a scream from another restroom that startled him and he fell on his left hand causing a fracture. ()
## 4 Employee was unloading truck using a pallet jack to unload heavy equipment. Heavy load caused him to lose balance, and he struck his right shoulder on door jamb causing unspecified injury. ()
## 5 Employee was leaning over to pick up a piece of paper. She fell out of rolling chair causing pain to lower back. Dx: lower back strain/buldging disc. ()
## 6 Employee was performing general lawn care maintence and outdoor tours and educatiol programs. He sustained a bite of right calf causing infection.
## ReturnToWorkDate AverageWeeklyWage ClaimantOpenedDate ClaimantClosedDate
## 1 2009-12-08 639.59 2009-07-02 2010-07-20
## 2 2009-06-26 NA 2009-07-02 2009-11-25
## 3 2009-07-13 1649.00 2009-07-02 2010-03-30
## 4 2009-06-12 NA 2009-07-02 2010-03-29
## 5 <NA> 539.00 2009-07-02 2011-05-06
## 6 2009-06-25 NA 2009-07-02 2009-11-25
## EmployerNotificationDate ReceivedDate IsDenied ClaimantAge_at_DOI Gender
## 1 2009-06-29 2009-07-02 0 49 Male
## 2 2009-07-01 2009-07-02 0 49 Male
## 3 2009-06-25 2009-07-02 0 47 Male
## 4 2009-06-23 2009-07-02 0 61 Male
## 5 2009-07-01 2009-07-02 0 41 Female
## 6 2009-06-25 2009-07-02 0 28 Male
## ClaimantType Injuryture BodyPartRegion BodyPart BillReviewALE
## 1 Indemnity Strain Trunk Lower Back Area NA
## 2 Medical Only Strain Trunk Lower Back Area NA
## 3 Indemnity Fracture Upper Extremities Hand NA
## 4 Medical Only Contusion Upper Extremities Shoulder(S) NA
## 5 Indemnity Strain Trunk Lower Back Area NA
## 6 Medical Only Puncture Lower Extremities Lower Leg NA
## Hospital PhysicianOutpatient Rx
## 1 NA NA NA
## 2 NA NA NA
## 3 NA NA NA
## 4 NA NA NA
## 5 NA NA NA
## 6 NA NA NA
Nuevo df para la Regresión lineal, tomando solo las variables que nos interesan. Cálculo de días totales de indemnización
df <- as.data.frame(claims)
df$Days <- as.numeric(difftime(df$ClaimantClosedDate, df$ClaimantOpenedDate, units = "days"))
df <- df[, c("TotalPaid", "Gender", "Days", "ClaimantType", "Injuryture", "BodyPartRegion", "BodyPart")]
df <- na.omit(df)
Regresión Lineal
regresion <- lm(TotalPaid ~ Gender + Days + ClaimantType
+ Injuryture + BodyPartRegion + BodyPart, data= df)
#summary(regresion)
Ajustar Modelo
df <- subset(df,
ClaimantType == "Medical Only" |
ClaimantType == "Report Only" |
BodyPart %in% c("Body Systems and Multiple Body Systems", "Brain", "Disc-Trunk", "Eyes",
"Insufficient Info to Properly Identify?Unclassified", "Lower Back Area",
"Lumbar and/or Sacral Vertebrae (Vertebra NOC Trunk)",
"Multiple Body Parts (Including Body Systems and Body Parts)",
"No Physical Injury", "Shoulder(S)")
)
Modelo Ajustado
regresion <- lm(TotalPaid ~ ClaimantType + Days + Injuryture + BodyPart, data= df)
#summary(regresion)
Modelo Predictivo
datos_m <- data.frame(Days = 100, ClaimantType= "Indemnity", Injuryture = "Strain", BodyPart = "Vertebrae")
predict(regresion,datos_m)
## 1
## 21685.6
Exploramos los datos
summary(datos)
## ClaimID TotalPaid TotalReserves TotalRecovery
## Min. : 650915 Min. : -270 Min. : 0 Min. : 0.00
## 1st Qu.: 811125 1st Qu.: 60 1st Qu.: 0 1st Qu.: 0.00
## Median : 844626 Median : 235 Median : 0 Median : 0.00
## Mean :10149151 Mean : 6746 Mean : 2233 Mean : 68.88
## 3rd Qu.:22716506 3rd Qu.: 938 3rd Qu.: 0 3rd Qu.: 0.00
## Max. :62203891 Max. :4527291 Max. :2069575 Max. :130541.03
## IndemnityPaid OtherPaid ClaimStatus IncidentDate
## Min. : -475 Min. : -7820 Length:134004 Length:134004
## 1st Qu.: 0 1st Qu.: 58 Class :character Class :character
## Median : 0 Median : 230 Mode :character Mode :character
## Mean : 3061 Mean : 3685
## 3rd Qu.: 0 3rd Qu.: 855
## Max. :640732 Max. :4129915
## IncidentDescription ReturnToWorkDate AverageWeeklyWage ClaimantOpenedDate
## Length:134004 Length:134004 Length:134004 Length:134004
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## ClaimantClosedDate EmployerNotificationDate ReceivedDate
## Length:134004 Length:134004 Length:134004
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
## IsDenied ClaimantAge_at_DOI Gender ClaimantType
## Min. :0.00000 Length:134004 Length:134004 Length:134004
## 1st Qu.:0.00000 Class :character Class :character Class :character
## Median :0.00000 Mode :character Mode :character Mode :character
## Mean :0.04474
## 3rd Qu.:0.00000
## Max. :1.00000
## Injuryture BodyPartRegion BodyPart
## Length:134004 Length:134004 Length:134004
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
#count(datos,ClaimID, sort=TRUE)
#count(datos,TotalPaid, sort=TRUE)
#count(datos,TotalReserves, sort=TRUE)
#count(datos,TotalRecovery, sort=TRUE)
#count(datos,IndemnityPaid, sort=TRUE)
#count(datos,OtherPaid, sort=TRUE)
#count(datos,ClaimStatus, sort=TRUE)
#count(datos,IncidentDate, sort=TRUE)
#count(datos,IncidentDescription, sort=TRUE)
#count(datos,ReturnToWorkDate, sort=TRUE)
#count(datos,AverageWeeklyWage, sort=TRUE)
#count(datos,ClaimantOpenedDate, sort=TRUE)
#count(datos,ClaimantClosedDate, sort=TRUE)
#count(datos,Gender, sort=TRUE)
#count(datos,ClaimantType, sort=TRUE)
#count(datos,Injuryture, sort=TRUE)
#count(datos,BodyPartRegion, sort=TRUE)
#count(datos,BodyPart, sort=TRUE)
Observaciones:
1. Tenemos NAs en la base de datos
2. En el género hay una opción de no disponible
Limpiamos los datos
#Seleccionamos los datos que nos interesan
ad<- datos[,c("BodyPartRegion","Gender","IsDenied","ClaimStatus")]
#Cuanto Nas tenemos
sum(is.na(ad))
## [1] 0
#Cuantos por variable
sapply(ad,function(x) sum(is.na(x)))
## BodyPartRegion Gender IsDenied ClaimStatus
## 0 0 0 0
#eliminar NA
ad <- na.omit(ad)
str(ad)
## 'data.frame': 134004 obs. of 4 variables:
## $ BodyPartRegion: chr "Trunk" "Trunk" "Upper Extremities" "Upper Extremities" ...
## $ Gender : chr "Male" "Male" "Male" "Male" ...
## $ IsDenied : int 0 0 0 0 0 0 0 0 0 0 ...
## $ ClaimStatus : chr "C" "C" "C" "C" ...
#conertir las variables a factor las que sean categoricas
ad$BodyPartRegion <- as.factor(ad$BodyPartRegion)
ad$Gender <- as.factor(ad$Gender)
ad$IsDenied <- as.factor(ad$IsDenied)
str(ad)
## 'data.frame': 134004 obs. of 4 variables:
## $ BodyPartRegion: Factor w/ 8 levels "Head","Lower Extremities",..: 7 7 8 8 7 2 7 8 3 3 ...
## $ Gender : Factor w/ 3 levels "Female","Male",..: 2 2 2 2 1 2 1 2 1 2 ...
## $ IsDenied : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ ClaimStatus : chr "C" "C" "C" "C" ...
Gráficamos el Árbol de Decisiones
arbol <- rpart(formula = Gender ~ ., data= ad)
arbol
## n= 134004
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 134004 68879 Male (0.441755470 0.485992955 0.072251575)
## 2) BodyPartRegion=Head,Lower Extremities,Multiple Body Parts,Neck,Not Available,Trunk,Upper Extremities 121353 59449 Male (0.484001220 0.510115119 0.005883662)
## 4) BodyPartRegion=Lower Extremities,Multiple Body Parts,Neck,Not Available,Upper Extremities 87298 43365 Female (0.503253225 0.491202548 0.005544228)
## 8) IsDenied=1 4491 1874 Female (0.582720998 0.413048319 0.004230684) *
## 9) IsDenied=0 82807 41491 Female (0.498943326 0.495441207 0.005615467)
## 18) BodyPartRegion=Lower Extremities,Multiple Body Parts 41527 19959 Female (0.519372938 0.476003564 0.004623498) *
## 19) BodyPartRegion=Neck,Not Available,Upper Extremities 41280 20021 Male (0.478391473 0.514995155 0.006613372) *
## 5) BodyPartRegion=Head,Trunk 34055 15032 Male (0.434649831 0.558596388 0.006753781) *
## 3) BodyPartRegion=Non-Standard Code 12651 3683 Not Available (0.036518852 0.254604379 0.708876769) *
rpart.plot(arbol)
Nuevo df para el análisis de clusters
bd <- as.data.frame(claims)
Exploramos la base de datos
summary(bd)
## ClaimID TotalPaid TotalReserves TotalRecovery
## Min. : 650915 Min. : -22 Min. : 0 Min. : 0.00
## 1st Qu.: 814283 1st Qu.: 82 1st Qu.: 0 1st Qu.: 0.00
## Median : 845571 Median : 281 Median : 0 Median : 0.00
## Mean :12146395 Mean : 8368 Mean : 3365 Mean : 68.22
## 3rd Qu.:22725794 3rd Qu.: 1229 3rd Qu.: 0 3rd Qu.: 0.00
## Max. :62203891 Max. :4527291 Max. :2069575 Max. :130541.03
##
## IndemnityPaid OtherPaid ClaimStatus IncidentDate
## Min. : -475 Min. : -22 Length:88833 Min. :1968-09-27
## 1st Qu.: 0 1st Qu.: 79 Class :character 1st Qu.:2003-08-05
## Median : 0 Median : 275 Mode :character Median :2007-02-27
## Mean : 3698 Mean : 4670 Mean :2006-10-11
## 3rd Qu.: 0 3rd Qu.: 1126 3rd Qu.:2010-11-17
## Max. :640732 Max. :4129915 Max. :2014-06-27
##
## IncidentDescription ReturnToWorkDate AverageWeeklyWage
## Length:88833 Min. :1984-01-01 Min. : 0.0
## Class :character 1st Qu.:2005-12-09 1st Qu.: 300.0
## Mode :character Median :2009-04-29 Median : 502.6
## Mean :2008-07-19 Mean : 629.0
## 3rd Qu.:2012-02-14 3rd Qu.: 684.8
## Max. :2015-05-07 Max. :2024000.0
## NA's :30493 NA's :50480
## ClaimantOpenedDate ClaimantClosedDate EmployerNotificationDate
## Min. :1968-09-27 Min. :1999-06-01 Min. :1972-09-10
## 1st Qu.:2003-09-18 1st Qu.:2005-10-04 1st Qu.:2004-03-31
## Median :2007-04-18 Median :2008-12-11 Median :2007-09-26
## Mean :2006-11-22 Mean :2008-10-09 Mean :2007-10-07
## 3rd Qu.:2010-12-13 3rd Qu.:2011-06-10 3rd Qu.:2011-06-01
## Max. :2014-06-30 Max. :2014-06-30 Max. :9988-02-21
## NA's :4637 NA's :11756
## ReceivedDate IsDenied ClaimantAge_at_DOI Gender
## Min. :1968-09-27 Min. :0.00000 Min. : 0.00 Length:88833
## 1st Qu.:2003-09-16 1st Qu.:0.00000 1st Qu.:33.00 Class :character
## Median :2007-03-28 Median :0.00000 Median :42.00 Mode :character
## Mean :2007-01-28 Mean :0.05227 Mean :42.05
## 3rd Qu.:2010-12-17 3rd Qu.:0.00000 3rd Qu.:51.00
## Max. :5202-01-10 Max. :1.00000 Max. :94.00
##
## ClaimantType Injuryture BodyPartRegion BodyPart
## Length:88833 Length:88833 Length:88833 Length:88833
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
#count(bd,ClaimantOpenedDate,sort = TRUE)
#count(bd,ClaimantClosedDate,sort = TRUE)
Observaciones
1. ClaimID, TotalPaid, TotalReserves y TotalRecovery son variables de
tipo character y necesitan ser cambiados a numeric.
2. Existen negativos en TotalPaid.
Modificamos el tipo de variable
bd$TotalReserves<-as.numeric(bd$TotalReserves)
bd$TotalPaid<-as.numeric(bd$TotalPaid)
bd$TotalRecovery<-as.numeric(bd$TotalRecovery)
Eliminamos los negativos de total paid
bd <-bd[bd$TotalPaid>0,]
Realizamos cálculos importantes
# Obtener el Tiempo de Procesamiento en días.
bd<-mutate(bd, TiempoProcesamiento=difftime(bd$ClaimantClosedDate,bd$ClaimantOpenedDate, units="days"))
# Obtener el Costo Total
bd <- mutate(bd,CostoTotal=TotalReserves+TotalPaid-TotalRecovery)
Creamos una tabla nueva con las variables de interes: Tiempo de Procesamiento, Costo Total y ClaimID
nueva_bd<-select(bd,ClaimID,TiempoProcesamiento,CostoTotal)
Realizamos las modificaciones necesarias a nuestro data frame
# Cambiar el tipo de variable TiempoProcesamiento de difftime-numeric.
nueva_bd$TiempoProcesamiento <- as.numeric(nueva_bd$TiempoProcesamiento)
#Eliminamos los na
nueva_bd <- na.omit(nueva_bd)
Guardamos la columna de ID en una variable temporal y la eliminamos del conjunto de datos
ids <- nueva_bd$ClaimID
nueva_bd <- nueva_bd[, -1]
Determinamos el número de clusters optimo
wss <- numeric(10)
for (i in 1:10) {
kmeans_result <- kmeans(nueva_bd, centers = i)
wss[i] <- kmeans_result$tot.withinss
}
## Warning: Quick-TRANSfer stage steps exceeded maximum (= 3416650)
## Warning: did not converge in 10 iterations
plot(1:10, wss, type = "b", xlab = "Número de Clusters (k)", ylab = "Suma de Cuadrados Dentro del Clúster (WSS)")
Realizamos K-means
k_optimo <- 4
kmeans_result <- kmeans(nueva_bd, centers = k_optimo)
#kmeans_result
resultados <- data.frame(ID = ids, Cluster = kmeans_result$cluster)
head(resultados)
## ID Cluster
## 1 650915 2
## 3 650917 2
## 4 650918 2
## 5 650919 1
## 6 650920 2
## 8 650929 2
Visualizamos los resultados
fviz_cluster(kmeans_result, data = nueva_bd)