Titanic

Titanic
Titanic
library (tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.2     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.2     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library (foreign)
library (ggplot2)
library(dplyr)
library(scales)
## 
## Attaching package: 'scales'
## 
## The following object is masked from 'package:purrr':
## 
##     discard
## 
## The following object is masked from 'package:readr':
## 
##     col_factor
library(ggrepel)

file.choose()

# Load the raw Titanic passenger table from a local CSV file.
# NOTE(review): the path is absolute and machine-specific, so this line only
# runs where that exact directory exists.
titanic <- read.csv(
  file = "C://Users/IanAb/Documents/7to Semestre/DATA BASE/titanic.csv"
)

Titanic

# Print per-column summary statistics of the raw data; the output below also
# reports the number of NA values in each numeric column.
summary(titanic)
##      pclass         survived         name               sex           
##  Min.   :1.000   Min.   :0.000   Length:1310        Length:1310       
##  1st Qu.:2.000   1st Qu.:0.000   Class :character   Class :character  
##  Median :3.000   Median :0.000   Mode  :character   Mode  :character  
##  Mean   :2.295   Mean   :0.382                                        
##  3rd Qu.:3.000   3rd Qu.:1.000                                        
##  Max.   :3.000   Max.   :1.000                                        
##  NA's   :1       NA's   :1                                            
##       age              sibsp            parch          ticket         
##  Min.   : 0.1667   Min.   :0.0000   Min.   :0.000   Length:1310       
##  1st Qu.:21.0000   1st Qu.:0.0000   1st Qu.:0.000   Class :character  
##  Median :28.0000   Median :0.0000   Median :0.000   Mode  :character  
##  Mean   :29.8811   Mean   :0.4989   Mean   :0.385                     
##  3rd Qu.:39.0000   3rd Qu.:1.0000   3rd Qu.:0.000                     
##  Max.   :80.0000   Max.   :8.0000   Max.   :9.000                     
##  NA's   :264       NA's   :1        NA's   :1                         
##       fare            cabin             embarked             boat          
##  Min.   :  0.000   Length:1310        Length:1310        Length:1310       
##  1st Qu.:  7.896   Class :character   Class :character   Class :character  
##  Median : 14.454   Mode  :character   Mode  :character   Mode  :character  
##  Mean   : 33.295                                                           
##  3rd Qu.: 31.275                                                           
##  Max.   :512.329                                                           
##  NA's   :2                                                                 
##       body        home.dest        
##  Min.   :  1.0   Length:1310       
##  1st Qu.: 72.0   Class :character  
##  Median :155.0   Mode  :character  
##  Mean   :160.8                     
##  3rd Qu.:256.0                     
##  Max.   :328.0                     
##  NA's   :1189
# count(titanic,name,sort=TRUE)

Observaciones: 1. Tenemos NA en la base de datos 2. Un par de nombres están repetidos

# count(titanic,name,sort=TRUE)
# count(titanic,sex,sort=TRUE)
# count(titanic,ticket,sort=TRUE)
# count(titanic,cabin,sort=TRUE)
# count(titanic,embarked,sort=TRUE)
# count(titanic,boat,sort=TRUE)
# count(titanic,home.dest,sort=TRUE)

Limpiar la base de datos

Cambio del nombre de columnas

# Rename the passenger-class column from "pclass" to "class".
# The column is located by name rather than by position, so a CSV with a
# different column order cannot end up with the wrong column relabelled.
# Re-running this line after the rename is a no-op.
names(titanic)[names(titanic) == "pclass"] <- "class"

Extraer las variables de interés

# Keep only the four variables used by the model: passenger class, age, sex
# and the survival outcome.
# NOTE(review): `Titanic` differs from `titanic` only by case and shadows the
# built-in `datasets::Titanic` table.
Titanic <- titanic[c("class", "age", "sex", "survived")]

¿Cuántos NA tengo en la base de datos?

# Total number of missing cells across all columns of `Titanic`.
sum(is.na(Titanic))
## [1] 266

¿Cuántos NA tengo por variable?

# Number of missing values in each column, returned as a named integer vector.
# vapply() pins the result to one integer per column, whereas sapply() can
# change its return type depending on the input.
vapply(Titanic, function(x) sum(is.na(x)), integer(1))
##    class      age      sex survived 
##        1      264        0        1

Eliminar NA

# Drop every row that has at least one NA; na.omit() records the removed rows
# in the "na.action" attribute, which is visible in the str() output.
Titanic <- stats::na.omit(Titanic)
# Inspect the structure of the cleaned data frame.
utils::str(Titanic)
## 'data.frame':    1046 obs. of  4 variables:
##  $ class   : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ age     : num  29 0.917 2 30 25 ...
##  $ sex     : chr  "female" "male" "female" "male" ...
##  $ survived: int  1 1 0 0 0 1 1 0 1 0 ...
##  - attr(*, "na.action")= 'omit' Named int [1:264] 16 38 41 47 60 70 71 75 81 107 ...
##   ..- attr(*, "names")= chr [1:264] "16" "38" "41" "47" ...

Cambio de tipo de variable categórica a factor

# Convert the three categorical columns (class, sex, survived) to factors in a
# single column-wise assignment; `age` stays numeric.
Titanic[c("class", "sex", "survived")] <-
  lapply(Titanic[c("class", "sex", "survived")], as.factor)
# Confirm the new column types.
str(Titanic)
## 'data.frame':    1046 obs. of  4 variables:
##  $ class   : Factor w/ 3 levels "1","2","3": 1 1 1 1 1 1 1 1 1 1 ...
##  $ age     : num  29 0.917 2 30 25 ...
##  $ sex     : Factor w/ 2 levels "female","male": 1 2 1 2 1 2 1 2 1 2 ...
##  $ survived: Factor w/ 2 levels "0","1": 2 2 1 1 1 2 2 1 2 1 ...
##  - attr(*, "na.action")= 'omit' Named int [1:264] 16 38 41 47 60 70 71 75 81 107 ...
##   ..- attr(*, "names")= chr [1:264] "16" "38" "41" "47" ...

Crear el árbol de decisiones

librerías a usar

library (rpart)
library(rpart.plot)
# Fit a classification tree predicting survival from all remaining columns
# (class, age, sex).
# method = "class" is stated explicitly instead of letting rpart guess it from
# the response type: if `survived` were left as a 0/1 integer, the guess would
# be a regression ("anova") tree.
arbol <- rpart(
  formula = survived ~ .,
  data = Titanic,
  method = "class"
)
# Print the fitted splits and node class probabilities.
arbol
## n= 1046 
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
##  1) root 1046 427 0 (0.59177820 0.40822180)  
##    2) sex=male 658 135 0 (0.79483283 0.20516717)  
##      4) age>=9.5 615 110 0 (0.82113821 0.17886179) *
##      5) age< 9.5 43  18 1 (0.41860465 0.58139535)  
##       10) class=3 29  11 0 (0.62068966 0.37931034) *
##       11) class=1,2 14   0 1 (0.00000000 1.00000000) *
##    3) sex=female 388  96 1 (0.24742268 0.75257732)  
##      6) class=3 152  72 0 (0.52631579 0.47368421)  
##       12) age>=1.5 145  66 0 (0.54482759 0.45517241) *
##       13) age< 1.5 7   1 1 (0.14285714 0.85714286) *
##      7) class=1,2 236  16 1 (0.06779661 0.93220339) *
# Draw the Titanic tree with rpart.plot's default layout.
rpart.plot::rpart.plot(arbol)

# Draw the same tree with prp(), using node-label style `extra = 7`.
# NOTE(review): for class models this should show the probability of the second
# class without the fitted class label; confirm against the rpart.plot docs.
rpart.plot::prp(arbol, extra = 7)

Cáncer de Mama

Titanic
Titanic
library (tidyverse)
library (foreign)
library (ggplot2)
library(dplyr)
library(scales)
library(ggrepel)

file.choose()

# Load the breast-cancer diagnostic table from a local CSV file.
# NOTE(review): the path is absolute and machine-specific, so this line only
# runs where that exact directory exists.
cancermama <- read.csv(
  file = "C://Users/IanAb/Documents/7to Semestre/DATA BASE/cancer_de_mama.csv"
)

Entender la base de datos

# Print per-column summary statistics; the output below shows no NA rows and
# that `diagnosis` is the only character column.
summary(cancermama)
##   diagnosis          radius_mean      texture_mean   perimeter_mean  
##  Length:569         Min.   : 6.981   Min.   : 9.71   Min.   : 43.79  
##  Class :character   1st Qu.:11.700   1st Qu.:16.17   1st Qu.: 75.17  
##  Mode  :character   Median :13.370   Median :18.84   Median : 86.24  
##                     Mean   :14.127   Mean   :19.29   Mean   : 91.97  
##                     3rd Qu.:15.780   3rd Qu.:21.80   3rd Qu.:104.10  
##                     Max.   :28.110   Max.   :39.28   Max.   :188.50  
##    area_mean      smoothness_mean   compactness_mean  concavity_mean   
##  Min.   : 143.5   Min.   :0.05263   Min.   :0.01938   Min.   :0.00000  
##  1st Qu.: 420.3   1st Qu.:0.08637   1st Qu.:0.06492   1st Qu.:0.02956  
##  Median : 551.1   Median :0.09587   Median :0.09263   Median :0.06154  
##  Mean   : 654.9   Mean   :0.09636   Mean   :0.10434   Mean   :0.08880  
##  3rd Qu.: 782.7   3rd Qu.:0.10530   3rd Qu.:0.13040   3rd Qu.:0.13070  
##  Max.   :2501.0   Max.   :0.16340   Max.   :0.34540   Max.   :0.42680  
##  concave.points_mean symmetry_mean    fractal_dimension_mean   radius_se     
##  Min.   :0.00000     Min.   :0.1060   Min.   :0.04996        Min.   :0.1115  
##  1st Qu.:0.02031     1st Qu.:0.1619   1st Qu.:0.05770        1st Qu.:0.2324  
##  Median :0.03350     Median :0.1792   Median :0.06154        Median :0.3242  
##  Mean   :0.04892     Mean   :0.1812   Mean   :0.06280        Mean   :0.4052  
##  3rd Qu.:0.07400     3rd Qu.:0.1957   3rd Qu.:0.06612        3rd Qu.:0.4789  
##  Max.   :0.20120     Max.   :0.3040   Max.   :0.09744        Max.   :2.8730  
##    texture_se      perimeter_se       area_se        smoothness_se     
##  Min.   :0.3602   Min.   : 0.757   Min.   :  6.802   Min.   :0.001713  
##  1st Qu.:0.8339   1st Qu.: 1.606   1st Qu.: 17.850   1st Qu.:0.005169  
##  Median :1.1080   Median : 2.287   Median : 24.530   Median :0.006380  
##  Mean   :1.2169   Mean   : 2.866   Mean   : 40.337   Mean   :0.007041  
##  3rd Qu.:1.4740   3rd Qu.: 3.357   3rd Qu.: 45.190   3rd Qu.:0.008146  
##  Max.   :4.8850   Max.   :21.980   Max.   :542.200   Max.   :0.031130  
##  compactness_se      concavity_se     concave.points_se   symmetry_se      
##  Min.   :0.002252   Min.   :0.00000   Min.   :0.000000   Min.   :0.007882  
##  1st Qu.:0.013080   1st Qu.:0.01509   1st Qu.:0.007638   1st Qu.:0.015160  
##  Median :0.020450   Median :0.02589   Median :0.010930   Median :0.018730  
##  Mean   :0.025478   Mean   :0.03189   Mean   :0.011796   Mean   :0.020542  
##  3rd Qu.:0.032450   3rd Qu.:0.04205   3rd Qu.:0.014710   3rd Qu.:0.023480  
##  Max.   :0.135400   Max.   :0.39600   Max.   :0.052790   Max.   :0.078950  
##  fractal_dimension_se  radius_worst   texture_worst   perimeter_worst 
##  Min.   :0.0008948    Min.   : 7.93   Min.   :12.02   Min.   : 50.41  
##  1st Qu.:0.0022480    1st Qu.:13.01   1st Qu.:21.08   1st Qu.: 84.11  
##  Median :0.0031870    Median :14.97   Median :25.41   Median : 97.66  
##  Mean   :0.0037949    Mean   :16.27   Mean   :25.68   Mean   :107.26  
##  3rd Qu.:0.0045580    3rd Qu.:18.79   3rd Qu.:29.72   3rd Qu.:125.40  
##  Max.   :0.0298400    Max.   :36.04   Max.   :49.54   Max.   :251.20  
##    area_worst     smoothness_worst  compactness_worst concavity_worst 
##  Min.   : 185.2   Min.   :0.07117   Min.   :0.02729   Min.   :0.0000  
##  1st Qu.: 515.3   1st Qu.:0.11660   1st Qu.:0.14720   1st Qu.:0.1145  
##  Median : 686.5   Median :0.13130   Median :0.21190   Median :0.2267  
##  Mean   : 880.6   Mean   :0.13237   Mean   :0.25427   Mean   :0.2722  
##  3rd Qu.:1084.0   3rd Qu.:0.14600   3rd Qu.:0.33910   3rd Qu.:0.3829  
##  Max.   :4254.0   Max.   :0.22260   Max.   :1.05800   Max.   :1.2520  
##  concave.points_worst symmetry_worst   fractal_dimension_worst
##  Min.   :0.00000      Min.   :0.1565   Min.   :0.05504        
##  1st Qu.:0.06493      1st Qu.:0.2504   1st Qu.:0.07146        
##  Median :0.09993      Median :0.2822   Median :0.08004        
##  Mean   :0.11461      Mean   :0.2901   Mean   :0.08395        
##  3rd Qu.:0.16140      3rd Qu.:0.3179   3rd Qu.:0.09208        
##  Max.   :0.29100      Max.   :0.6638   Max.   :0.20750
# count(cancermama,diagnosis,sort=TRUE)

Observaciones: 1. No tenemos NA en la base de datos 2. No requiere de una limpieza de datos

Crear el árbol de decisiones

librerías a usar

library (rpart)
library(rpart.plot)
# Fit a classification tree predicting `diagnosis` from all other columns.
# `diagnosis` is a character column, so method = "class" is stated explicitly
# instead of relying on rpart to infer the model type from the response.
arbol2 <- rpart(
  formula = diagnosis ~ .,
  data = cancermama,
  method = "class"
)
# Print the fitted splits and node class probabilities.
arbol2
## n= 569 
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
##  1) root 569 212 B (0.62741652 0.37258348)  
##    2) radius_worst< 16.795 379  33 B (0.91292876 0.08707124)  
##      4) concave.points_worst< 0.1358 333   5 B (0.98498498 0.01501502) *
##      5) concave.points_worst>=0.1358 46  18 M (0.39130435 0.60869565)  
##       10) texture_worst< 25.67 19   4 B (0.78947368 0.21052632) *
##       11) texture_worst>=25.67 27   3 M (0.11111111 0.88888889) *
##    3) radius_worst>=16.795 190  11 M (0.05789474 0.94210526) *
# Draw the breast-cancer tree with rpart.plot's default layout.
rpart.plot::rpart.plot(arbol2)

# Draw the same tree with prp(), using node-label style `extra = 7`.
# NOTE(review): for class models this should show the probability of the second
# class without the fitted class label; confirm against the rpart.plot docs.
rpart.plot::prp(arbol2, extra = 7)

LS0tDQp0aXRsZTogIkFjdGl2aWRhZCA0LjIgQXJib2wgZGUgRGVjaXNpb24iDQphdXRob3I6ICJJYW4gQWJyYWhhbSBRdWlyb3ogQ3VhcGlvIEEwMTczMTQxOCINCmRhdGU6ICIyMDIzLTA5LTE5Ig0Kb3V0cHV0Og0KICBodG1sX2RvY3VtZW50Og0KICAgIHRvYzogeWVzDQogICAgdG9jX2Zsb2F0OiB5ZXMNCiAgICBjb2RlX2Rvd25sb2FkOiBUUlVFDQogICAgdGhlbWU6ICJyZWFkYWJsZSINCiAgICBoaWdobGlnaHQ6ICJweWdtZW50cyINCmxhbmc6ICJlcy1FUyINCi0tLQ0KDQpgYGB7ciBzZXR1cCwgaW5jbHVkZT1GQUxTRX0NCmtuaXRyOjpvcHRzX2NodW5rJHNldChlY2hvID0gVFJVRSkNCmBgYA0KDQoNCjxjZW50ZXI+DQojIDxzcGFuIHN0eWxlPSJjb2xvcjpibHVlOyI+IFRpdGFuaWMgPC9zcGFuPg0KPC9jZW50ZXI+DQoNCiFbVGl0YW5pY10oQzovVXNlcnMvSWFuQWIvRG9jdW1lbnRzLzd0byBTZW1lc3RyZS90aXRhbmljLmdpZikNCg0KDQpgYGB7cn0NCmxpYnJhcnkgKHRpZHl2ZXJzZSkNCmxpYnJhcnkgKGZvcmVpZ24pDQpsaWJyYXJ5IChnZ3Bsb3QyKQ0KbGlicmFyeShkcGx5cikNCmxpYnJhcnkoc2NhbGVzKQ0KbGlicmFyeShnZ3JlcGVsKQ0KYGBgDQoNCg0KRmlsZS5DaG9vc2UoKQ0KDQpgYGB7cn0NCnRpdGFuaWM8LXJlYWQuY3N2KCJDOi8vVXNlcnMvSWFuQWIvRG9jdW1lbnRzLzd0byBTZW1lc3RyZS9EQVRBIEJBU0UvdGl0YW5pYy5jc3YiKQ0KYGBgDQoNCjxjZW50ZXI+DQojIyMgPHNwYW4gc3R5bGU9ImNvbG9yOmJsdWU7Ij4gVGl0YW5pYyA8L3NwYW4+DQoNCg0KYGBge3J9DQpzdW1tYXJ5KHRpdGFuaWMpDQojIGNvdW50KHRpdGFuaWMsbmFtZSxzb3J0PVRSVUUpDQpgYGANCg0KT2JzZXJ2YWNpb25lczoNCjEuIFRuZWVtb3MgTkEgZW4gbGEgYmFzZSBkZSBkYXRvcyANCjIuIFVuIHBhciBkZSBkZSBub21icmVzIGVzdMOhbiByZXBldGlkb3MNCg0KDQpgYGB7cn0NCiMgY291bnQodGl0YW5pYyxuYW1lLHNvcnQ9VFJVRSkNCiMgY291bnQodGl0YW5pYyxzZXgsc29ydD1UUlVFKQ0KIyBjb3VudCh0aXRhbmljLHRpY2tldCxzb3J0PVRSVUUpDQojIGNvdW50KHRpdGFuaWMsY2FiaW4sc29ydD1UUlVFKQ0KIyBjb3VudCh0aXRhbmljLGVtYmFya2VkLHNvcnQ9VFJVRSkNCiMgY291bnQodGl0YW5pYyxib2F0LHNvcnQ9VFJVRSkNCiMgY291bnQodGl0YW5pYyxob21lLmRlc3Qsc29ydD1UUlVFKQ0KDQpgYGANCg0KPGNlbnRlcj4NCiMjIyA8c3BhbiBzdHlsZT0iY29sb3I6Ymx1ZTsiPiBMaW1waWFyIGxhIGJhc2UgZGUgZGF0b3MgPC9zcGFuPg0KDQoNCkNhbWJpbyBkZWwgbm9tYnJlIGRlIGNvbHVtbmFzDQoNCg0KYGBge3J9DQpjb2xuYW1lcyh0aXRhbmljKVsxXTwtImNsYXNzIg0KYGBgDQoNCkV4dHJhZXIgbGFzIHZhcmlhYmxlcyBkZSBpbnRlcsOpcw0KDQpgYGB7cn0NClRpdGFuaWM8LXRpdGFuaWNbLGMoImNsYXNzIiwiYWdlIiwic2V4Iiwic3Vydml2ZWQiKV0NCmBgYA0KDQpjdWFudG9zIE5BIHRlbmdvIGVuIGxhIGJh
c2UgZGUgZGF0b3MgDQoNCmBgYHtyfQ0Kc3VtKGlzLm5hKFRpdGFuaWMpKQ0KYGBgDQoNCsK/Q3XDoW50b3MgTkEgdGVuZ28gcG9yIHZhcmlhYmxlPw0KYGBge3J9DQpzYXBwbHkoVGl0YW5pYyxmdW5jdGlvbih4KSBzdW0gKGlzLm5hKHgpKSkNCmBgYA0KDQpFbGltaW5hciBOQQ0KYGBge3J9DQpUaXRhbmljPC1uYS5vbWl0KFRpdGFuaWMpDQpzdHIoVGl0YW5pYykNCmBgYA0KDQpDYW1iaW8gZGUgdGlwbyBkZSB2YXJpYWJsZSBjYXRlZ29yaWNhIGEgZmFjdG9yDQoNCmBgYHtyfQ0KVGl0YW5pYyRjbGFzczwtYXMuZmFjdG9yKFRpdGFuaWMkY2xhc3MpDQpUaXRhbmljJHNleDwtYXMuZmFjdG9yKFRpdGFuaWMkc2V4KQ0KVGl0YW5pYyRzdXJ2aXZlZDwtYXMuZmFjdG9yKFRpdGFuaWMkc3Vydml2ZWQpDQpzdHIoVGl0YW5pYykNCmBgYA0KDQojIyMgPHNwYW4gc3R5bGU9ImNvbG9yOmJsdWU7Ij4gQ3JlYXIgZWwgw6FyYm9sIGRlIGRlY2lvbmVzIDwvc3Bhbj4NCg0KbGlicmVyw61hcyBhIHVzYXINCmBgYHtyfQ0KbGlicmFyeSAocnBhcnQpDQpsaWJyYXJ5KHJwYXJ0LnBsb3QpDQpgYGANCg0KDQpgYGB7cn0NCmFyYm9sPC1ycGFydChmb3JtdWxhPXN1cnZpdmVkIH4gLixkYXRhPVRpdGFuaWMpDQphcmJvbA0KcnBhcnQucGxvdChhcmJvbCkNCg0KcHJwKGFyYm9sLGV4dHJhPTcpDQpgYGANCg0KDQoNCmBgYHtyfQ0KDQpgYGANCg0KDQoNCjxjZW50ZXI+DQojIyAgPHNwYW4gc3R5bGU9ImNvbG9yOnBpbms7Ij4gQ2FuY2VyIGRlIE1hbWEgPC9zcGFuPg0KPC9jZW50ZXI+DQoNCiFbVGl0YW5pY10oQzovVXNlcnMvSWFuQWIvRG9jdW1lbnRzLzd0byBTZW1lc3RyZS9tYW1hLmdpZikNCg0KDQpgYGB7cn0NCmxpYnJhcnkgKHRpZHl2ZXJzZSkNCmxpYnJhcnkgKGZvcmVpZ24pDQpsaWJyYXJ5IChnZ3Bsb3QyKQ0KbGlicmFyeShkcGx5cikNCmxpYnJhcnkoc2NhbGVzKQ0KbGlicmFyeShnZ3JlcGVsKQ0KYGBgDQoNCg0KRmlsZS5DaG9vc2UoKQ0KDQpgYGB7cn0NCmNhbmNlcm1hbWE8LXJlYWQuY3N2KCJDOi8vVXNlcnMvSWFuQWIvRG9jdW1lbnRzLzd0byBTZW1lc3RyZS9EQVRBIEJBU0UvY2FuY2VyX2RlX21hbWEuY3N2IikNCmBgYA0KDQo8Y2VudGVyPg0KIyMjIDxzcGFuIHN0eWxlPSJjb2xvcjpjYWRldHBpbms7Ij4gRW50ZW5kZXIgbGEgYmFzZSBkZSBkYXRvcyA8L3NwYW4+DQoNCg0KYGBge3J9DQpzdW1tYXJ5KGNhbmNlcm1hbWEpDQojIGNvdW50KGNhbmNlcm1hbWEsZGlhZ25vc2lzLHNvcnQ9VFJVRSkNCmBgYA0KDQoNCk9ic2VydmFjaW9uZXM6DQoxLiBObyB0ZW5lbW9zIE5BIGVuIGxhIGJhc2UgZGUgZGF0b3MgDQoyLiBObyByZXF1ZWlyZSBkZSB1bmEgbGltcGllemEgZGUgZGF0b3MNCg0KDQojIyMgPHNwYW4gc3R5bGU9ImNvbG9yOnBpbms7Ij4gQ3JlYXIgZWwgw6FyYm9sIGRlIGRlY2lvbmVzIDwvc3Bhbj4NCg0KbGlicmVyw61hcyBhIHVzYXINCmBgYHtyfQ0KbGlicmFyeSAocnBhcnQpDQpsaWJyYXJ5KHJwYXJ0LnBsb3QpDQpgYGANCg0K
DQpgYGB7cn0NCmFyYm9sMjwtcnBhcnQoZm9ybXVsYT1kaWFnbm9zaXMgfiAuLGRhdGE9Y2FuY2VybWFtYSkNCmFyYm9sMg0KcnBhcnQucGxvdChhcmJvbDIpDQoNCnBycChhcmJvbDIsZXh0cmE9NykNCmBgYA0KDQoNCg==