22 de febrero de 2017

Leer Datos

Dataset del Titanic

# Show each chunk's source code alongside its output in the rendered document.
knitr::opts_chunk$set(echo = TRUE)
# Print the current working directory (relative file paths resolve against it).
getwd()
## [1] "/Users/julietarodriguez/Downloads/Mini proyecto 1"
# Load the training CSV from the working directory, keeping text columns as
# character vectors instead of converting them to factors.
train <- read.csv("train.csv", stringsAsFactors = FALSE)

Muestra del dataset Titanic

Función head

head(train)
##   PassengerId Survived Pclass
## 1           1        0      3
## 2           2        1      1
## 3           3        1      3
## 4           4        1      1
## 5           5        0      3
## 6           6        0      3
##                                                  Name    Sex Age SibSp
## 1                             Braund, Mr. Owen Harris   male  22     1
## 2 Cumings, Mrs. John Bradley (Florence Briggs Thayer) female  38     1
## 3                              Heikkinen, Miss. Laina female  26     0
## 4        Futrelle, Mrs. Jacques Heath (Lily May Peel) female  35     1
## 5                            Allen, Mr. William Henry   male  35     0
## 6                                    Moran, Mr. James   male  NA     0
##   Parch           Ticket    Fare Cabin Embarked
## 1     0        A/5 21171  7.2500              S
## 2     0         PC 17599 71.2833   C85        C
## 3     0 STON/O2. 3101282  7.9250              S
## 4     0           113803 53.1000  C123        S
## 5     0           373450  8.0500              S
## 6     0           330877  8.4583              Q

Función tail

tail(train)
##     PassengerId Survived Pclass                                     Name
## 886         886        0      3     Rice, Mrs. William (Margaret Norton)
## 887         887        0      2                    Montvila, Rev. Juozas
## 888         888        1      1             Graham, Miss. Margaret Edith
## 889         889        0      3 Johnston, Miss. Catherine Helen "Carrie"
## 890         890        1      1                    Behr, Mr. Karl Howell
## 891         891        0      3                      Dooley, Mr. Patrick
##        Sex Age SibSp Parch     Ticket   Fare Cabin Embarked
## 886 female  39     0     5     382652 29.125              Q
## 887   male  27     0     0     211536 13.000              S
## 888 female  19     0     0     112053 30.000   B42        S
## 889 female  NA     1     2 W./C. 6607 23.450              S
## 890   male  26     0     0     111369 30.000  C148        C
## 891   male  32     0     0     370376  7.750              Q

Estructura y dominio del dataset

Función str

str(train)
## 'data.frame':    891 obs. of  12 variables:
##  $ PassengerId: int  1 2 3 4 5 6 7 8 9 10 ...
##  $ Survived   : int  0 1 1 1 0 0 0 0 1 1 ...
##  $ Pclass     : int  3 1 3 1 3 3 1 3 3 2 ...
##  $ Name       : chr  "Braund, Mr. Owen Harris" "Cumings, Mrs. John Bradley (Florence Briggs Thayer)" "Heikkinen, Miss. Laina" "Futrelle, Mrs. Jacques Heath (Lily May Peel)" ...
##  $ Sex        : chr  "male" "female" "female" "female" ...
##  $ Age        : num  22 38 26 35 35 NA 54 2 27 14 ...
##  $ SibSp      : int  1 1 0 1 0 0 0 3 0 1 ...
##  $ Parch      : int  0 0 0 0 0 0 0 1 2 0 ...
##  $ Ticket     : chr  "A/5 21171" "PC 17599" "STON/O2. 3101282" "113803" ...
##  $ Fare       : num  7.25 71.28 7.92 53.1 8.05 ...
##  $ Cabin      : chr  "" "C85" "" "C123" ...
##  $ Embarked   : chr  "S" "C" "S" "S" ...

Sumario del dataset

Summary del dataset

summary(train)
##   PassengerId       Survived          Pclass          Name          
##  Min.   :  1.0   Min.   :0.0000   Min.   :1.000   Length:891        
##  1st Qu.:223.5   1st Qu.:0.0000   1st Qu.:2.000   Class :character  
##  Median :446.0   Median :0.0000   Median :3.000   Mode  :character  
##  Mean   :446.0   Mean   :0.3838   Mean   :2.309                     
##  3rd Qu.:668.5   3rd Qu.:1.0000   3rd Qu.:3.000                     
##  Max.   :891.0   Max.   :1.0000   Max.   :3.000                     
##                                                                     
##      Sex                 Age            SibSp           Parch       
##  Length:891         Min.   : 0.42   Min.   :0.000   Min.   :0.0000  
##  Class :character   1st Qu.:20.12   1st Qu.:0.000   1st Qu.:0.0000  
##  Mode  :character   Median :28.00   Median :0.000   Median :0.0000  
##                     Mean   :29.70   Mean   :0.523   Mean   :0.3816  
##                     3rd Qu.:38.00   3rd Qu.:1.000   3rd Qu.:0.0000  
##                     Max.   :80.00   Max.   :8.000   Max.   :6.0000  
##                     NA's   :177                                     
##     Ticket               Fare           Cabin             Embarked        
##  Length:891         Min.   :  0.00   Length:891         Length:891        
##  Class :character   1st Qu.:  7.91   Class :character   Class :character  
##  Mode  :character   Median : 14.45   Mode  :character   Mode  :character  
##                     Mean   : 32.20                                        
##                     3rd Qu.: 31.00                                        
##                     Max.   :512.33                                        
## 

Descripción de atributos del dataset

Sobrevivientes

table(train$Survived)
## 
##   0   1 
## 549 342

Clase del viaje

table(train$Pclass)
## 
##   1   2   3 
## 216 184 491

Género

table(train$Sex)
## 
## female   male 
##    314    577
# Cross-tabulate sex (rows) against the Survived flag (columns).
table(train$Sex, train$Survived)
##         
##            0   1
##   female  81 233
##   male   468 109

"Parche"

table(train$Parch)
## 
##   0   1   2   3   4   5   6 
## 678 118  80   5   4   5   1

Tickets

length(unique(train$Ticket))
## [1] 681

Puerto de embarque

# C = Cherbourg, Q = Queenstown (present-day Cobh, Ireland), S = Southampton.
# NOTE(review): the resulting table also shows a blank level, which suggests
# some rows carry an empty Embarked value -- confirm whether those should be
# treated as missing.
train$Embarked <- as.factor(train$Embarked)
table(train$Embarked)
## 
##       C   Q   S 
##   2 168  77 644

Estadísticas por atributos

Sobrevivientes

table(train$Survived)
## 
##   0   1 
## 549 342
summary(train$Survived)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.0000  0.0000  0.3838  1.0000  1.0000

Clase

table(train$Pclass)
## 
##   1   2   3 
## 216 184 491
summary(train$Pclass)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.000   2.000   3.000   2.309   3.000   3.000

"Parche", padres/hijos a bordo

table(train$Parch)
## 
##   0   1   2   3   4   5   6 
## 678 118  80   5   4   5   1
summary(train$Parch)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.0000  0.0000  0.3816  0.0000  6.0000

Tickets compartidos

# Keep the passengers whose ticket value appears on more than five rows, then
# list their names in ticket order. The frequency table is built once inside
# local() so no extra variables are left in the global environment.
ticket <- local({
  ticket_counts <- table(train$Ticket)
  train[train$Ticket %in% names(ticket_counts)[ticket_counts > 5], ]
})
ticket$Name[order(ticket$Ticket)]
##  [1] "Bing, Mr. Lee"                                            
##  [2] "Ling, Mr. Lee"                                            
##  [3] "Lang, Mr. Fang"                                           
##  [4] "Foo, Mr. Choong"                                          
##  [5] "Lam, Mr. Ali"                                             
##  [6] "Lam, Mr. Len"                                             
##  [7] "Chip, Mr. Chang"                                          
##  [8] "Panula, Master. Juha Niilo"                               
##  [9] "Panula, Master. Eino Viljami"                             
## [10] "Panula, Mr. Ernesti Arvid"                                
## [11] "Panula, Mrs. Juha (Maria Emilia Ojala)"                   
## [12] "Panula, Mr. Jaako Arnold"                                 
## [13] "Panula, Master. Urho Abraham"                             
## [14] "Andersson, Mr. Anders Johan"                              
## [15] "Andersson, Miss. Ellis Anna Maria"                        
## [16] "Andersson, Miss. Ingeborg Constanzia"                     
## [17] "Andersson, Miss. Sigrid Elisabeth"                        
## [18] "Andersson, Mrs. Anders Johan (Alfrida Konstantia Brogren)"
## [19] "Andersson, Miss. Ebba Iris Alfrida"                       
## [20] "Andersson, Master. Sigvard Harald Elias"                  
## [21] "Skoog, Master. Harald"                                    
## [22] "Skoog, Mrs. William (Anna Bernhardina Karlsson)"          
## [23] "Skoog, Mr. Wilhelm"                                       
## [24] "Skoog, Miss. Mabel"                                       
## [25] "Skoog, Miss. Margit Elizabeth"                            
## [26] "Skoog, Master. Karl Thorsten"                             
## [27] "Goodwin, Master. William Frederick"                       
## [28] "Goodwin, Miss. Lillian Amy"                               
## [29] "Goodwin, Master. Sidney Leonard"                          
## [30] "Goodwin, Master. Harold Victor"                           
## [31] "Goodwin, Mrs. Frederick (Augusta Tyler)"                  
## [32] "Goodwin, Mr. Charles Edward"                              
## [33] "Sage, Master. Thomas Henry"                               
## [34] "Sage, Miss. Constance Gladys"                             
## [35] "Sage, Mr. Frederick"                                      
## [36] "Sage, Mr. George John Jr"                                 
## [37] "Sage, Miss. Stella Anna"                                  
## [38] "Sage, Mr. Douglas Bullen"                                 
## [39] "Sage, Miss. Dorothy Edith \"Dolly\""

Boxplots y su interpretación

Summary de las edades de los pasajeros a bordo

summary(train$Age)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##    0.42   20.12   28.00   29.70   38.00   80.00     177

Boxplot de las edades de los pasajeros a bordo

boxplot(train$Age, xlab="Edad de los pasajeros a bordo")

Summary de las clases en que viajaron los pasajeros a bordo

summary(train$Pclass)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.000   2.000   3.000   2.309   3.000   3.000

Boxplot de las clases en que viajaron los pasajeros a bordo

boxplot(train$Pclass, xlab="Clases en que viajaron los pasajeros a bordo")

Boxplots de las tarifas

# Box plot of the fare each passenger paid
boxplot(
  train$Fare,
  main = " Boxplot del precio pagado por abordar",
  ylab = " Precio (moneda local)"
)

Histogramas

Histograma de la edad

# Histogram of the recorded (non-missing) ages, with the count drawn on top of
# each bar.
hist(
  train$Age[!is.na(train$Age)],
  main = "Histograma de las edades de los pasajeros",
  xlab = "Edad",
  ylab = "Personas",
  labels = TRUE
)

Histograma de las tarifas

# Histogram of ticket fares
hist(
  train$Fare,
  main = "Histograma de los precios de las tarifas",
  xlab = "Precio ($)",
  ylab = "Frecuencia"
)

#skewness(train$Fare)

Varianza

\[\sigma^{2} = \frac{\sum_{i=1}^{n} \left(x_{i} - \bar{x}\right)^{2}} {n-1}\]

# Sample variance of passenger ages (n - 1 denominator), ignoring missing ages.
# var() already returns the variance; wrapping it in sqrt() would report the
# standard deviation instead.
var(train$Age, na.rm = TRUE)
## [1] 211.0191

Desviación Estándar

La desviación estándar mide cuánto se dispersan los datos respecto a su media.

\[\sigma = \sqrt{\frac{\sum\limits_{i=1}^{n} \left(x_{i} - \bar{x}\right)^{2}} {n-1}}\]

# Standard deviation of age, computed on the observed ages only. Filling the
# missing ages with the mean before calling sd() adds values with zero
# deviation from the mean and understates the spread.
sd(train$Age, na.rm = TRUE)
## [1] 14.5265
# Standard deviation of the fare paid.
sd(train$Fare)
## [1] 49.69343

Correlación

Este coeficiente nos permite medir qué tan relacionados están dos conjuntos de datos, o si son independientes entre sí.

# Fill any missing age with the mean of the observed ages, then draw fare
# against age as a scatter plot.
train$Age <- replace(
  train$Age,
  is.na(train$Age),
  mean(train$Age, na.rm = TRUE)
)
with(
  train,
  plot(Age, Fare, xlab = "Edad del pasajero", ylab = "Tarifa pagada")
)

Correlación

Clase

# Scatter plot of the fare paid against travel class
with(
  train,
  plot(Pclass, Fare, xlab = "Clase del viaje", ylab = "Tarifa pagada")
)

Variables categóricas

Sobrevivientes

plot(table(train$Survived))

Variables categóricas

Clase del viaje

plot(table(train$Pclass))

Representación de los sobrevivientes respecto al género

Representación de los sobrevivientes respecto a sus clases sociales

Gracias