22 de febrero de 2017
# Echo the source of every chunk in the knitted report.
knitr::opts_chunk$set(echo = TRUE)
# Show the working directory that relative paths such as "train.csv"
# are resolved against.
getwd()
## [1] "/Users/julietarodriguez/Downloads/Mini proyecto 1"
# Load the training data; text columns stay as character vectors.
train <- utils::read.csv(file = "train.csv", stringsAsFactors = FALSE)
# Preview the first six rows.
head(train, n = 6L)
## PassengerId Survived Pclass ## 1 1 0 3 ## 2 2 1 1 ## 3 3 1 3 ## 4 4 1 1 ## 5 5 0 3 ## 6 6 0 3 ## Name Sex Age SibSp ## 1 Braund, Mr. Owen Harris male 22 1 ## 2 Cumings, Mrs. John Bradley (Florence Briggs Thayer) female 38 1 ## 3 Heikkinen, Miss. Laina female 26 0 ## 4 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35 1 ## 5 Allen, Mr. William Henry male 35 0 ## 6 Moran, Mr. James male NA 0 ## Parch Ticket Fare Cabin Embarked ## 1 0 A/5 21171 7.2500 S ## 2 0 PC 17599 71.2833 C85 C ## 3 0 STON/O2. 3101282 7.9250 S ## 4 0 113803 53.1000 C123 S ## 5 0 373450 8.0500 S ## 6 0 330877 8.4583 Q
# Preview the last six rows.
tail(train, n = 6L)
## PassengerId Survived Pclass Name ## 886 886 0 3 Rice, Mrs. William (Margaret Norton) ## 887 887 0 2 Montvila, Rev. Juozas ## 888 888 1 1 Graham, Miss. Margaret Edith ## 889 889 0 3 Johnston, Miss. Catherine Helen "Carrie" ## 890 890 1 1 Behr, Mr. Karl Howell ## 891 891 0 3 Dooley, Mr. Patrick ## Sex Age SibSp Parch Ticket Fare Cabin Embarked ## 886 female 39 0 5 382652 29.125 Q ## 887 male 27 0 0 211536 13.000 S ## 888 female 19 0 0 112053 30.000 B42 S ## 889 female NA 1 2 W./C. 6607 23.450 S ## 890 male 26 0 0 111369 30.000 C148 C ## 891 male 32 0 0 370376 7.750 Q
# Compact overview of column names, types and leading values.
utils::str(object = train)
## 'data.frame': 891 obs. of 12 variables: ## $ PassengerId: int 1 2 3 4 5 6 7 8 9 10 ... ## $ Survived : int 0 1 1 1 0 0 0 0 1 1 ... ## $ Pclass : int 3 1 3 1 3 3 1 3 3 2 ... ## $ Name : chr "Braund, Mr. Owen Harris" "Cumings, Mrs. John Bradley (Florence Briggs Thayer)" "Heikkinen, Miss. Laina" "Futrelle, Mrs. Jacques Heath (Lily May Peel)" ... ## $ Sex : chr "male" "female" "female" "female" ... ## $ Age : num 22 38 26 35 35 NA 54 2 27 14 ... ## $ SibSp : int 1 1 0 1 0 0 0 3 0 1 ... ## $ Parch : int 0 0 0 0 0 0 0 1 2 0 ... ## $ Ticket : chr "A/5 21171" "PC 17599" "STON/O2. 3101282" "113803" ... ## $ Fare : num 7.25 71.28 7.92 53.1 8.05 ... ## $ Cabin : chr "" "C85" "" "C123" ... ## $ Embarked : chr "S" "C" "S" "S" ...
# Per-column summary of the whole data frame.
summary(object = train)
## PassengerId Survived Pclass Name ## Min. : 1.0 Min. :0.0000 Min. :1.000 Length:891 ## 1st Qu.:223.5 1st Qu.:0.0000 1st Qu.:2.000 Class :character ## Median :446.0 Median :0.0000 Median :3.000 Mode :character ## Mean :446.0 Mean :0.3838 Mean :2.309 ## 3rd Qu.:668.5 3rd Qu.:1.0000 3rd Qu.:3.000 ## Max. :891.0 Max. :1.0000 Max. :3.000 ## ## Sex Age SibSp Parch ## Length:891 Min. : 0.42 Min. :0.000 Min. :0.0000 ## Class :character 1st Qu.:20.12 1st Qu.:0.000 1st Qu.:0.0000 ## Mode :character Median :28.00 Median :0.000 Median :0.0000 ## Mean :29.70 Mean :0.523 Mean :0.3816 ## 3rd Qu.:38.00 3rd Qu.:1.000 3rd Qu.:0.0000 ## Max. :80.00 Max. :8.000 Max. :6.0000 ## NA's :177 ## Ticket Fare Cabin Embarked ## Length:891 Min. : 0.00 Length:891 Length:891 ## Class :character 1st Qu.: 7.91 Class :character Class :character ## Mode :character Median : 14.45 Mode :character Mode :character ## Mean : 32.20 ## 3rd Qu.: 31.00 ## Max. :512.33 ##
# Count passengers per value of Survived.
table(train[["Survived"]])
## ## 0 1 ## 549 342
# Count passengers per travel class.
table(train[["Pclass"]])
## ## 1 2 3 ## 216 184 491
# Count passengers per sex.
table(train[["Sex"]])
## ## female male ## 314 577
# Cross-tabulate sex (rows) against Survived (columns).
table(train[["Sex"]], train[["Survived"]])
## ## 0 1 ## female 81 233 ## male 468 109
# Count passengers per value of Parch.
table(train[["Parch"]])
## ## 0 1 2 3 4 5 6 ## 678 118 80 5 4 5 1
# Number of distinct ticket identifiers.
sum(!duplicated(train[["Ticket"]]))
## [1] 681
# Port codes: C = Cherbourg, Q = Queenstown (present-day Cobh, Ireland),
# S = Southampton.
# Convert the port column to a factor and count passengers per port; blank
# entries form their own (empty-string) level.
train$Embarked <- as.factor(train$Embarked)
table(train$Embarked)
## ## C Q S ## 2 168 77 644
# Counts per value of Survived (same tabulation as earlier in the report).
table(train[["Survived"]])
## ## 0 1 ## 549 342
# Numeric summary of Survived; its mean is the share of rows equal to 1.
summary(train[["Survived"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max. ## 0.0000 0.0000 0.0000 0.3838 1.0000 1.0000
# Counts per travel class (same tabulation as earlier in the report).
table(train[["Pclass"]])
## ## 1 2 3 ## 216 184 491
# Numeric summary of the travel class codes.
summary(train[["Pclass"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max. ## 1.000 2.000 3.000 2.309 3.000 3.000
# Counts per value of Parch (same tabulation as earlier in the report).
table(train[["Parch"]])
## ## 0 1 2 3 4 5 6 ## 678 118 80 5 4 5 1
# Numeric summary of Parch.
summary(train[["Parch"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max. ## 0.0000 0.0000 0.0000 0.3816 0.0000 6.0000
# Passengers whose ticket identifier appears on more than five rows.
# The frequency table is built once and reused for both the names and the
# filter.
ticket_counts <- table(train$Ticket)
shared_tickets <- names(ticket_counts)[ticket_counts > 5]
ticket <- train[train$Ticket %in% shared_tickets, ]
# List their names ordered by ticket identifier so groups appear together.
ticket[order(ticket$Ticket), ]$Name
## [1] "Bing, Mr. Lee" ## [2] "Ling, Mr. Lee" ## [3] "Lang, Mr. Fang" ## [4] "Foo, Mr. Choong" ## [5] "Lam, Mr. Ali" ## [6] "Lam, Mr. Len" ## [7] "Chip, Mr. Chang" ## [8] "Panula, Master. Juha Niilo" ## [9] "Panula, Master. Eino Viljami" ## [10] "Panula, Mr. Ernesti Arvid" ## [11] "Panula, Mrs. Juha (Maria Emilia Ojala)" ## [12] "Panula, Mr. Jaako Arnold" ## [13] "Panula, Master. Urho Abraham" ## [14] "Andersson, Mr. Anders Johan" ## [15] "Andersson, Miss. Ellis Anna Maria" ## [16] "Andersson, Miss. Ingeborg Constanzia" ## [17] "Andersson, Miss. Sigrid Elisabeth" ## [18] "Andersson, Mrs. Anders Johan (Alfrida Konstantia Brogren)" ## [19] "Andersson, Miss. Ebba Iris Alfrida" ## [20] "Andersson, Master. Sigvard Harald Elias" ## [21] "Skoog, Master. Harald" ## [22] "Skoog, Mrs. William (Anna Bernhardina Karlsson)" ## [23] "Skoog, Mr. Wilhelm" ## [24] "Skoog, Miss. Mabel" ## [25] "Skoog, Miss. Margit Elizabeth" ## [26] "Skoog, Master. Karl Thorsten" ## [27] "Goodwin, Master. William Frederick" ## [28] "Goodwin, Miss. Lillian Amy" ## [29] "Goodwin, Master. Sidney Leonard" ## [30] "Goodwin, Master. Harold Victor" ## [31] "Goodwin, Mrs. Frederick (Augusta Tyler)" ## [32] "Goodwin, Mr. Charles Edward" ## [33] "Sage, Master. Thomas Henry" ## [34] "Sage, Miss. Constance Gladys" ## [35] "Sage, Mr. Frederick" ## [36] "Sage, Mr. George John Jr" ## [37] "Sage, Miss. Stella Anna" ## [38] "Sage, Mr. Douglas Bullen" ## [39] "Sage, Miss. Dorothy Edith \"Dolly\""
# Numeric summary of passenger age, including the count of missing values.
summary(train[["Age"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's ## 0.42 20.12 28.00 29.70 38.00 80.00 177
# Box plot of passenger ages.
boxplot(
  train[["Age"]],
  xlab = "Edad de los pasajeros a bordo"
)
# Numeric summary of the travel class codes.
summary(train[["Pclass"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max. ## 1.000 2.000 3.000 2.309 3.000 3.000
# Box plot of the travel class codes.
boxplot(
  train[["Pclass"]],
  xlab = "Clases en que viajaron los pasajeros a bordo"
)
# Box plot of the fare each passenger paid.
boxplot(
  train[["Fare"]],
  ylab = " Precio (moneda local)",
  main = " Boxplot del precio pagado por abordar"
)
# Histogram of the non-missing passenger ages, with the count drawn above
# each bar.
hist(
  stats::na.omit(train[["Age"]]),
  labels = TRUE,
  ylab = "Personas",
  xlab = "Edad",
  main = "Histograma de las edades de los pasajeros"
)
# Histogram of ticket fares.
hist(
  train[["Fare"]],
  ylab = "Frecuencia",
  xlab = "Precio ($)",
  main = "Histograma de los precios de las tarifas"
)
#skewness(train$Fare)
\[\sigma^{2} = \frac{\sum_{i=1}^{n} \left(x_{i} - \bar{x}\right)^{2}} {n-1}\]
# Standard deviation of passenger age, computed as the square root of the
# sample variance; missing ages are ignored.
sqrt(var(train$Age, na.rm = TRUE))
## [1] 14.5265
La desviación estándar mide cuánto se separan los datos respecto de su media; es la raíz cuadrada de la varianza.
\[\sigma = \sqrt{\frac{\sum\limits_{i=1}^{n} \left(x_{i} - \bar{x}\right)^{2}} {n-1}}\]
# Replace missing ages with the mean of the observed ages (modifies `train`
# in place), then take the standard deviation of the completed column.
# NOTE(review): the imputed rows sit exactly at the mean, so this value is
# smaller than the standard deviation of the observed ages alone.
train$Age[is.na(train$Age)] <- mean(train$Age, na.rm = TRUE)
sd(train$Age)
## [1] 13.00202
# Standard deviation of the ticket fares.
stats::sd(train[["Fare"]])
## [1] 49.69343
Este coeficiente nos permite medir qué tan relacionados están dos conjuntos de datos, o si uno es independiente del otro.
# Fill any missing ages with the mean of the observed ages so every
# passenger can be drawn (has no effect when no age is missing).
train$Age[is.na(train$Age)] <- mean(train$Age, na.rm = TRUE)
# Scatter plot of fare paid against passenger age.
plot(train$Age, train$Fare, xlab = "Edad del pasajero", ylab = "Tarifa pagada")
Clase
# Scatter plot of fare paid against travel class.
plot(
  x = train[["Pclass"]],
  y = train[["Fare"]],
  ylab = "Tarifa pagada",
  xlab = "Clase del viaje"
)
Sobrevivientes
# Plot the number of passengers for each value of Survived.
plot(table(train$Survived))
Clase del viaje
# Plot the number of passengers in each travel class.
plot(table(train$Pclass))